From 9e043711206d047ddc8faf0ada22208e87defe9e Mon Sep 17 00:00:00 2001
From: James Wyles <jwyles@nvidia.com>
Date: Mon, 28 Jan 2019 10:46:59 -0700
Subject: [PATCH 1/6] Initial work moving in BFS

---
 CMakeLists.txt                                |    2 +
 external/cub/CHANGE_LOG.TXT                   |  381 +
 external/cub/LICENSE.TXT                      |   24 +
 external/cub/README.md                        |  128 +
 external/cub/common.mk                        |  233 +
 external/cub/cub/agent/agent_histogram.cuh    |  787 ++
 .../cub/agent/agent_radix_sort_downsweep.cuh  |  772 ++
 .../cub/agent/agent_radix_sort_upsweep.cuh    |  526 ++
 external/cub/cub/agent/agent_reduce.cuh       |  385 +
 .../cub/cub/agent/agent_reduce_by_key.cuh     |  549 ++
 external/cub/cub/agent/agent_rle.cuh          |  837 ++
 external/cub/cub/agent/agent_scan.cuh         |  471 ++
 .../cub/cub/agent/agent_segment_fixup.cuh     |  375 +
 external/cub/cub/agent/agent_select_if.cuh    |  703 ++
 external/cub/cub/agent/agent_spmv_orig.cuh    |  670 ++
 .../cub/agent/single_pass_scan_operators.cuh  |  815 ++
 .../cub/block/block_adjacent_difference.cuh   |  596 ++
 .../cub/cub/block/block_discontinuity.cuh     | 1148 +++
 external/cub/cub/block/block_exchange.cuh     | 1248 +++
 external/cub/cub/block/block_histogram.cuh    |  415 +
 external/cub/cub/block/block_load.cuh         | 1268 +++
 external/cub/cub/block/block_radix_rank.cuh   |  697 ++
 external/cub/cub/block/block_radix_sort.cuh   |  862 +++
 .../cub/cub/block/block_raking_layout.cuh     |  152 +
 external/cub/cub/block/block_reduce.cuh       |  607 ++
 external/cub/cub/block/block_scan.cuh         | 2126 +++++
 external/cub/cub/block/block_shuffle.cuh      |  305 +
 external/cub/cub/block/block_store.cuh        | 1000 +++
 .../block_histogram_atomic.cuh                |   82 +
 .../specializations/block_histogram_sort.cuh  |  226 +
 .../specializations/block_reduce_raking.cuh   |  222 +
 .../block_reduce_raking_commutative_only.cuh  |  199 +
 .../block_reduce_warp_reductions.cuh          |  222 +
 .../specializations/block_scan_raking.cuh     |  666 ++
 .../specializations/block_scan_warp_scans.cuh |  392 +
 .../block_scan_warp_scans2.cuh                |  436 ++
 .../block_scan_warp_scans3.cuh                |  418 +
 external/cub/cub/cub.cuh                      |   95 +
 external/cub/cub/device/device_histogram.cuh  |  866 +++
 external/cub/cub/device/device_partition.cuh  |  273 +
 external/cub/cub/device/device_radix_sort.cuh |  796 ++
 external/cub/cub/device/device_reduce.cuh     |  734 ++
 .../cub/device/device_run_length_encode.cuh   |  278 +
 external/cub/cub/device/device_scan.cuh       |  443 ++
 .../device/device_segmented_radix_sort.cuh    |  875 +++
 .../cub/device/device_segmented_reduce.cuh    |  619 ++
 external/cub/cub/device/device_select.cuh     |  369 +
 external/cub/cub/device/device_spmv.cuh       |  174 +
 .../device/dispatch/dispatch_histogram.cuh    | 1096 +++
 .../device/dispatch/dispatch_radix_sort.cuh   | 1652 ++++
 .../cub/device/dispatch/dispatch_reduce.cuh   |  882 +++
 .../dispatch/dispatch_reduce_by_key.cuh       |  554 ++
 .../cub/cub/device/dispatch/dispatch_rle.cuh  |  538 ++
 .../cub/cub/device/dispatch/dispatch_scan.cuh |  563 ++
 .../device/dispatch/dispatch_select_if.cuh    |  542 ++
 .../device/dispatch/dispatch_spmv_orig.cuh    |  834 ++
 external/cub/cub/grid/grid_barrier.cuh        |  211 +
 external/cub/cub/grid/grid_even_share.cuh     |  222 +
 external/cub/cub/grid/grid_mapping.cuh        |  113 +
 external/cub/cub/grid/grid_queue.cuh          |  220 +
 external/cub/cub/host/mutex.cuh               |  171 +
 .../cub/iterator/arg_index_input_iterator.cuh |  259 +
 .../cache_modified_input_iterator.cuh         |  240 +
 .../cache_modified_output_iterator.cuh        |  254 +
 .../cub/iterator/constant_input_iterator.cuh  |  235 +
 .../cub/iterator/counting_input_iterator.cuh  |  228 +
 .../cub/iterator/discard_output_iterator.cuh  |  220 +
 .../cub/iterator/tex_obj_input_iterator.cuh   |  310 +
 .../cub/iterator/tex_ref_input_iterator.cuh   |  374 +
 .../cub/iterator/transform_input_iterator.cuh |  252 +
 external/cub/cub/thread/thread_load.cuh       |  438 ++
 external/cub/cub/thread/thread_operators.cuh  |  317 +
 external/cub/cub/thread/thread_reduce.cuh     |  152 +
 external/cub/cub/thread/thread_scan.cuh       |  268 +
 external/cub/cub/thread/thread_search.cuh     |  154 +
 external/cub/cub/thread/thread_store.cuh      |  422 +
 external/cub/cub/util_allocator.cuh           |  708 ++
 external/cub/cub/util_arch.cuh                |  151 +
 external/cub/cub/util_debug.cuh               |  145 +
 external/cub/cub/util_device.cuh              |  347 +
 external/cub/cub/util_macro.cuh               |  103 +
 external/cub/cub/util_namespace.cuh           |   46 +
 external/cub/cub/util_ptx.cuh                 |  729 ++
 external/cub/cub/util_type.cuh                | 1141 +++
 .../warp/specializations/warp_reduce_shfl.cuh |  551 ++
 .../warp/specializations/warp_reduce_smem.cuh |  375 +
 .../warp/specializations/warp_scan_shfl.cuh   |  656 ++
 .../warp/specializations/warp_scan_smem.cuh   |  397 +
 external/cub/cub/warp/warp_reduce.cuh         |  612 ++
 external/cub/cub/warp/warp_scan.cuh           |  936 +++
 external/cub/eclipse code style profile.xml   |  155 +
 external/cub/examples/block/Makefile          |  128 +
 .../block/example_block_radix_sort.cu         |  323 +
 .../examples/block/example_block_reduce.cu    |  290 +
 .../cub/examples/block/example_block_scan.cu  |  334 +
 external/cub/examples/block/reduce_by_key.cu  |   57 +
 external/cub/examples/device/Makefile         |  197 +
 .../example_device_partition_flagged.cu       |  233 +
 .../device/example_device_partition_if.cu     |  244 +
 .../device/example_device_radix_sort.cu       |  226 +
 .../examples/device/example_device_reduce.cu  |  180 +
 .../examples/device/example_device_scan.cu    |  186 +
 .../device/example_device_select_flagged.cu   |  233 +
 .../device/example_device_select_if.cu        |  242 +
 .../device/example_device_select_unique.cu    |  221 +
 ...ample_device_sort_find_non_trivial_runs.cu |  384 +
 external/cub/experimental/Makefile            |  125 +
 .../experimental/defunct/example_coo_spmv.cu  | 1070 +++
 .../defunct/test_device_seg_reduce.cu         | 2142 +++++
 .../experimental/histogram/histogram_cub.h    |  109 +
 .../histogram/histogram_gmem_atomics.h        |  185 +
 .../histogram/histogram_smem_atomics.h        |  195 +
 .../cub/experimental/histogram_compare.cu     |  635 ++
 external/cub/experimental/sparse_matrix.h     | 1244 +++
 external/cub/experimental/spmv_compare.cu     |  917 +++
 external/cub/experimental/spmv_script.sh      |   30 +
 external/cub/test/Makefile                    |  453 ++
 external/cub/test/link_a.cu                   |   11 +
 external/cub/test/link_b.cu                   |   11 +
 external/cub/test/link_main.cpp               |   10 +
 external/cub/test/mersenne.h                  |  160 +
 external/cub/test/test_allocator.cu           |  459 ++
 external/cub/test/test_block_histogram.cu     |  310 +
 external/cub/test/test_block_load_store.cu    |  549 ++
 external/cub/test/test_block_radix_sort.cu    |  717 ++
 external/cub/test/test_block_reduce.cu        |  822 ++
 external/cub/test/test_block_scan.cu          |  929 +++
 external/cub/test/test_device_histogram.cu    | 1669 ++++
 external/cub/test/test_device_radix_sort.cu   | 1275 +++
 external/cub/test/test_device_reduce.cu       | 1339 ++++
 .../cub/test/test_device_reduce_by_key.cu     |  853 ++
 .../cub/test/test_device_run_length_encode.cu |  890 +++
 external/cub/test/test_device_scan.cu         | 1015 +++
 external/cub/test/test_device_select_if.cu    | 1039 +++
 .../cub/test/test_device_select_unique.cu     |  651 ++
 external/cub/test/test_grid_barrier.cu        |  152 +
 external/cub/test/test_iterator.cu            |  805 ++
 external/cub/test/test_util.h                 | 1600 ++++
 external/cub/test/test_warp_reduce.cu         |  840 ++
 external/cub/test/test_warp_scan.cu           |  630 ++
 external/cub/tune/Makefile                    |  192 +
 external/cub/tune/tune_device_reduce.cu       |  763 ++
 include/functions.h                           |    2 +
 python/bfs/bfs_wrapper.cpp                    | 6885 +++++++++++++++++
 python/bfs/bfs_wrapper.pyx                    |  193 +
 python/bfs/c_bfs.pxd                          |   75 +
 setup.py                                      |    2 +-
 src/bfs.cu                                    |  470 ++
 src/bfs.cuh                                   |  101 +
 src/bfs_kernels.cuh                           | 1560 ++++
 src/cugraph.cu                                |   26 +-
 src/utilities/sm_utils.h                      |  280 +
 152 files changed, 83105 insertions(+), 3 deletions(-)
 create mode 100644 external/cub/CHANGE_LOG.TXT
 create mode 100644 external/cub/LICENSE.TXT
 create mode 100644 external/cub/README.md
 create mode 100644 external/cub/common.mk
 create mode 100644 external/cub/cub/agent/agent_histogram.cuh
 create mode 100644 external/cub/cub/agent/agent_radix_sort_downsweep.cuh
 create mode 100644 external/cub/cub/agent/agent_radix_sort_upsweep.cuh
 create mode 100644 external/cub/cub/agent/agent_reduce.cuh
 create mode 100644 external/cub/cub/agent/agent_reduce_by_key.cuh
 create mode 100644 external/cub/cub/agent/agent_rle.cuh
 create mode 100644 external/cub/cub/agent/agent_scan.cuh
 create mode 100644 external/cub/cub/agent/agent_segment_fixup.cuh
 create mode 100644 external/cub/cub/agent/agent_select_if.cuh
 create mode 100644 external/cub/cub/agent/agent_spmv_orig.cuh
 create mode 100644 external/cub/cub/agent/single_pass_scan_operators.cuh
 create mode 100644 external/cub/cub/block/block_adjacent_difference.cuh
 create mode 100644 external/cub/cub/block/block_discontinuity.cuh
 create mode 100644 external/cub/cub/block/block_exchange.cuh
 create mode 100644 external/cub/cub/block/block_histogram.cuh
 create mode 100644 external/cub/cub/block/block_load.cuh
 create mode 100644 external/cub/cub/block/block_radix_rank.cuh
 create mode 100644 external/cub/cub/block/block_radix_sort.cuh
 create mode 100644 external/cub/cub/block/block_raking_layout.cuh
 create mode 100644 external/cub/cub/block/block_reduce.cuh
 create mode 100644 external/cub/cub/block/block_scan.cuh
 create mode 100644 external/cub/cub/block/block_shuffle.cuh
 create mode 100644 external/cub/cub/block/block_store.cuh
 create mode 100644 external/cub/cub/block/specializations/block_histogram_atomic.cuh
 create mode 100644 external/cub/cub/block/specializations/block_histogram_sort.cuh
 create mode 100644 external/cub/cub/block/specializations/block_reduce_raking.cuh
 create mode 100644 external/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
 create mode 100644 external/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
 create mode 100644 external/cub/cub/block/specializations/block_scan_raking.cuh
 create mode 100644 external/cub/cub/block/specializations/block_scan_warp_scans.cuh
 create mode 100644 external/cub/cub/block/specializations/block_scan_warp_scans2.cuh
 create mode 100644 external/cub/cub/block/specializations/block_scan_warp_scans3.cuh
 create mode 100644 external/cub/cub/cub.cuh
 create mode 100644 external/cub/cub/device/device_histogram.cuh
 create mode 100644 external/cub/cub/device/device_partition.cuh
 create mode 100644 external/cub/cub/device/device_radix_sort.cuh
 create mode 100644 external/cub/cub/device/device_reduce.cuh
 create mode 100644 external/cub/cub/device/device_run_length_encode.cuh
 create mode 100644 external/cub/cub/device/device_scan.cuh
 create mode 100644 external/cub/cub/device/device_segmented_radix_sort.cuh
 create mode 100644 external/cub/cub/device/device_segmented_reduce.cuh
 create mode 100644 external/cub/cub/device/device_select.cuh
 create mode 100644 external/cub/cub/device/device_spmv.cuh
 create mode 100644 external/cub/cub/device/dispatch/dispatch_histogram.cuh
 create mode 100644 external/cub/cub/device/dispatch/dispatch_radix_sort.cuh
 create mode 100644 external/cub/cub/device/dispatch/dispatch_reduce.cuh
 create mode 100644 external/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
 create mode 100644 external/cub/cub/device/dispatch/dispatch_rle.cuh
 create mode 100644 external/cub/cub/device/dispatch/dispatch_scan.cuh
 create mode 100644 external/cub/cub/device/dispatch/dispatch_select_if.cuh
 create mode 100644 external/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
 create mode 100644 external/cub/cub/grid/grid_barrier.cuh
 create mode 100644 external/cub/cub/grid/grid_even_share.cuh
 create mode 100644 external/cub/cub/grid/grid_mapping.cuh
 create mode 100644 external/cub/cub/grid/grid_queue.cuh
 create mode 100644 external/cub/cub/host/mutex.cuh
 create mode 100644 external/cub/cub/iterator/arg_index_input_iterator.cuh
 create mode 100644 external/cub/cub/iterator/cache_modified_input_iterator.cuh
 create mode 100644 external/cub/cub/iterator/cache_modified_output_iterator.cuh
 create mode 100644 external/cub/cub/iterator/constant_input_iterator.cuh
 create mode 100644 external/cub/cub/iterator/counting_input_iterator.cuh
 create mode 100644 external/cub/cub/iterator/discard_output_iterator.cuh
 create mode 100644 external/cub/cub/iterator/tex_obj_input_iterator.cuh
 create mode 100644 external/cub/cub/iterator/tex_ref_input_iterator.cuh
 create mode 100644 external/cub/cub/iterator/transform_input_iterator.cuh
 create mode 100644 external/cub/cub/thread/thread_load.cuh
 create mode 100644 external/cub/cub/thread/thread_operators.cuh
 create mode 100644 external/cub/cub/thread/thread_reduce.cuh
 create mode 100644 external/cub/cub/thread/thread_scan.cuh
 create mode 100644 external/cub/cub/thread/thread_search.cuh
 create mode 100644 external/cub/cub/thread/thread_store.cuh
 create mode 100644 external/cub/cub/util_allocator.cuh
 create mode 100644 external/cub/cub/util_arch.cuh
 create mode 100644 external/cub/cub/util_debug.cuh
 create mode 100644 external/cub/cub/util_device.cuh
 create mode 100644 external/cub/cub/util_macro.cuh
 create mode 100644 external/cub/cub/util_namespace.cuh
 create mode 100644 external/cub/cub/util_ptx.cuh
 create mode 100644 external/cub/cub/util_type.cuh
 create mode 100644 external/cub/cub/warp/specializations/warp_reduce_shfl.cuh
 create mode 100644 external/cub/cub/warp/specializations/warp_reduce_smem.cuh
 create mode 100644 external/cub/cub/warp/specializations/warp_scan_shfl.cuh
 create mode 100644 external/cub/cub/warp/specializations/warp_scan_smem.cuh
 create mode 100644 external/cub/cub/warp/warp_reduce.cuh
 create mode 100644 external/cub/cub/warp/warp_scan.cuh
 create mode 100644 external/cub/eclipse code style profile.xml
 create mode 100644 external/cub/examples/block/Makefile
 create mode 100644 external/cub/examples/block/example_block_radix_sort.cu
 create mode 100644 external/cub/examples/block/example_block_reduce.cu
 create mode 100644 external/cub/examples/block/example_block_scan.cu
 create mode 100644 external/cub/examples/block/reduce_by_key.cu
 create mode 100644 external/cub/examples/device/Makefile
 create mode 100644 external/cub/examples/device/example_device_partition_flagged.cu
 create mode 100644 external/cub/examples/device/example_device_partition_if.cu
 create mode 100644 external/cub/examples/device/example_device_radix_sort.cu
 create mode 100644 external/cub/examples/device/example_device_reduce.cu
 create mode 100644 external/cub/examples/device/example_device_scan.cu
 create mode 100644 external/cub/examples/device/example_device_select_flagged.cu
 create mode 100644 external/cub/examples/device/example_device_select_if.cu
 create mode 100644 external/cub/examples/device/example_device_select_unique.cu
 create mode 100644 external/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
 create mode 100644 external/cub/experimental/Makefile
 create mode 100644 external/cub/experimental/defunct/example_coo_spmv.cu
 create mode 100644 external/cub/experimental/defunct/test_device_seg_reduce.cu
 create mode 100644 external/cub/experimental/histogram/histogram_cub.h
 create mode 100644 external/cub/experimental/histogram/histogram_gmem_atomics.h
 create mode 100644 external/cub/experimental/histogram/histogram_smem_atomics.h
 create mode 100644 external/cub/experimental/histogram_compare.cu
 create mode 100644 external/cub/experimental/sparse_matrix.h
 create mode 100644 external/cub/experimental/spmv_compare.cu
 create mode 100755 external/cub/experimental/spmv_script.sh
 create mode 100644 external/cub/test/Makefile
 create mode 100644 external/cub/test/link_a.cu
 create mode 100644 external/cub/test/link_b.cu
 create mode 100644 external/cub/test/link_main.cpp
 create mode 100644 external/cub/test/mersenne.h
 create mode 100644 external/cub/test/test_allocator.cu
 create mode 100644 external/cub/test/test_block_histogram.cu
 create mode 100644 external/cub/test/test_block_load_store.cu
 create mode 100644 external/cub/test/test_block_radix_sort.cu
 create mode 100644 external/cub/test/test_block_reduce.cu
 create mode 100644 external/cub/test/test_block_scan.cu
 create mode 100644 external/cub/test/test_device_histogram.cu
 create mode 100644 external/cub/test/test_device_radix_sort.cu
 create mode 100644 external/cub/test/test_device_reduce.cu
 create mode 100644 external/cub/test/test_device_reduce_by_key.cu
 create mode 100644 external/cub/test/test_device_run_length_encode.cu
 create mode 100644 external/cub/test/test_device_scan.cu
 create mode 100644 external/cub/test/test_device_select_if.cu
 create mode 100644 external/cub/test/test_device_select_unique.cu
 create mode 100644 external/cub/test/test_grid_barrier.cu
 create mode 100644 external/cub/test/test_iterator.cu
 create mode 100644 external/cub/test/test_util.h
 create mode 100644 external/cub/test/test_warp_reduce.cu
 create mode 100644 external/cub/test/test_warp_scan.cu
 create mode 100644 external/cub/tune/Makefile
 create mode 100644 external/cub/tune/tune_device_reduce.cu
 create mode 100644 python/bfs/bfs_wrapper.cpp
 create mode 100644 python/bfs/bfs_wrapper.pyx
 create mode 100644 python/bfs/c_bfs.pxd
 create mode 100644 src/bfs.cu
 create mode 100755 src/bfs.cuh
 create mode 100644 src/bfs_kernels.cuh
 create mode 100644 src/utilities/sm_utils.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b42a8d079e5..624825aaa71 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -133,6 +133,7 @@ endif()
 include_directories(
     "${CMAKE_CURRENT_SOURCE_DIR}/include" 
     "${CMAKE_CURRENT_SOURCE_DIR}/src"
+    "${CMAKE_CURRENT_SOURCE_DIR}/external/cub"
     "${CUDA_INCLUDE_DIRS}" 
     "${CUDF_INCLUDE}"
     "${CMAKE_CURRENT_BINARY_DIR}/gunrock/"
@@ -147,6 +148,7 @@ cuda_add_library(cugraph SHARED
     src/grmat.cu
     src/cugraph.cu
     src/pagerank.cu
+    src/bfs.cu
     ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/test_utils.cu
     ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/error_utils.cu
     ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/misc_utils.cu
diff --git a/external/cub/CHANGE_LOG.TXT b/external/cub/CHANGE_LOG.TXT
new file mode 100644
index 00000000000..43860e691e7
--- /dev/null
+++ b/external/cub/CHANGE_LOG.TXT
@@ -0,0 +1,381 @@
+1.7.4    09/20/2017
+    - Bug fixes: 
+        - Issue #114: Can't pair non-trivially-constructible values in radix sort
+        - Issue #115: WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes < 32 
+          		  
+//-----------------------------------------------------------------------------
+
+1.7.3    08/28/2017
+    - Bug fixes: 
+        - Issue #110: DeviceHistogram null-pointer exception bug for iterator inputs
+          		  
+//-----------------------------------------------------------------------------
+
+1.7.2    08/26/2017
+    - Bug fixes: 
+        - Issue #104: Device-wide reduction is now "run-to-run" deterministic for 
+          pseudo-associative reduction operators (like floating point addition)
+          		  
+//-----------------------------------------------------------------------------
+
+1.7.1    08/18/2017
+    - Updated Volta radix sorting tuning policies 
+    - Bug fixes: 
+        - Issue #104 (uint64_t warp-reduce broken for cub 1.7.0 on cuda 8 and older)
+        - Issue #103 (Can't mix Thrust 9.0 and CUB)
+        - Issue #102 (CUB pulls in windows.h which defines min/max macros that conflict with std::min/std::max)
+        - Issue #99 (Radix sorting crashes NVCC on Windows 10 for SM52)
+        - Issue #98 (cuda-memcheck: --tool initcheck failed with lineOfSight)
+        - Issue #94 (Git clone size)
+        - Issue #93 (accept iterators for segment offsets)
+        - Issue #87 (CUB uses anonymous unions which is not valid C++)
+        - Issue #44 (Check for C++ 11 should be changed so that Visual Studio 2013 is also recognized as C++ 11 capable)
+          		  
+//-----------------------------------------------------------------------------
+
+1.7.0    06/07/2017
+    - Compatible with CUDA9 and SM7.x (Volta) independent thread scheduling 
+    - API change: remove cub::WarpAll() and cub::WarpAny().  These functions served to 
+      emulate __all and __any functionality for SM1.x devices, which did not have those 
+      operations.  However, the SM1.x devices are now deprecated in CUDA, and the 
+      interfaces of these two functions are now lacking the lane-mask needed 
+      for collectives to run on Volta SMs having independent thread scheduling. 
+    - Bug fixes: 
+        - Issue #86 Incorrect results with ReduceByKey
+          		  
+//-----------------------------------------------------------------------------
+
+1.6.4    12/06/2016
+    - Updated sm_5x, sm_6x tuning policies for radix sorting (3.5B and 3.4B 
+      32b keys/s on TitanX and GTX 1080, respectively)
+    - Bug fixes: 
+        - Restore fence work-around for scan (reduce-by-key, etc.) hangs 
+          in CUDA 8.5
+        - Issue 65: DeviceSegmentedRadixSort should allow inputs to have 
+          pointer-to-const type 
+        - Mollify Clang device-side warnings
+        - Remove out-dated VC project files
+          		  
+//-----------------------------------------------------------------------------
+
+1.6.3    11/20/2016
+    - API change: BlockLoad and BlockStore are now templated by the local
+      data type, instead of the Iterator type.  This allows for output iterators
+      having \p void as their \p value_type (e.g., discard iterators).
+    - Updated GP100 tuning policies for radix sorting (6.2B 32b keys/s)
+    - Bug fixes: 
+        - Issue #74: WarpReduce executes reduction operator for out-of-bounds items
+        - Issue #72 (cub::InequalityWrapper::operator() should be non-const)
+        - Issue #71 (KeyValuePair won't work if Key has non-trivial ctor)
+		- Issue #70 1.5.3 breaks BlockScan API.  Retroactively reversioned
+		  from v1.5.3 -> v1.6 to appropriately indicate API change.
+		- Issue #69 cub::BlockStore::Store doesn't compile if OutputIteratorT::value_type != T  
+        - Issue #68 (cub::TilePrefixCallbackOp::WarpReduce doesn't permit ptx 
+          arch specialization)
+		- Improved support for Win32 platforms (warnings, alignment, etc)
+		  
+//-----------------------------------------------------------------------------
+
+1.6.2 (was 1.5.5)    10/25/2016
+    - Updated Pascal tuning policies for radix sorting
+    - Bug fixes: 
+        - Fix for arm64 compilation of caching allocator
+
+//-----------------------------------------------------------------------------
+
+1.6.1 (was 1.5.4)    10/14/2016
+    - Bug fixes: 
+        - Fix for radix sorting bug introduced by scan refactorization
+
+//-----------------------------------------------------------------------------
+
+1.6.0 (was 1.5.3)    10/11/2016
+    - API change: Device/block/warp-wide exclusive scans have been revised to now 
+      accept an "initial value" (instead of an "identity value") for seeding the 
+      computation with an arbitrary prefix.  
+    - API change: Device-wide reductions and scans can now have input 
+      sequence types that are different from output sequence types (as long 
+      as they are coercible)
+    - Reduce repository size (move doxygen binary to doc repository)
+    - Minor reductions in block-scan instruction count
+    - Bug fixes: 
+        - Issue #55: warning in cub/device/dispatch/dispatch_reduce_by_key.cuh 
+        - Issue #59: cub::DeviceScan::ExclusiveSum can't prefix sum of float into double
+        - Issue #58: Infinite loop in cub::CachingDeviceAllocator::NearestPowerOf
+        - Issue #47: Caching allocator needs to clean up cuda error upon successful retry 
+        - Issue #46: Very high amount of needed memory from the cub::DeviceHistogram::HistogramEven routine
+        - Issue #45: Caching Device Allocator fails with debug output enabled
+        - Fix for generic-type reduce-by-key warpscan (sm3.x and newer)
+
+//-----------------------------------------------------------------------------
+
+1.5.2    03/21/2016
+	- Improved medium-size scan performance for sm5x (Maxwell)
+    - Refactored caching allocator for device memory
+   		- Spends less time locked
+		- Failure to allocate a block from the runtime will retry once after
+		  freeing cached allocations
+		- Now respects max-bin (issue where blocks in excess of max-bin were
+		  still being retained in free cache)
+		- Uses C++11 mutex when available
+    - Bug fixes: 
+        - Fix for generic-type reduce-by-key warpscan (sm3.x and newer)
+          
+//-----------------------------------------------------------------------------
+
+1.5.1    12/28/2015
+    - Bug fixes: 
+        - Fix for incorrect DeviceRadixSort output for some small problems on 
+          Maxwell SM52 architectures
+        - Fix for macro redefinition warnings when compiling with Thrust sort
+          
+//-----------------------------------------------------------------------------
+
+1.5.0    12/14/2015
+    - New Features:
+        - Added new segmented device-wide operations for device-wide sort and 
+          reduction primitives.
+    - Bug fixes: 
+        - Fix for Git Issue 36 (Compilation error with GCC 4.8.4 nvcc 7.0.27) and
+          Forums thread (ThreadLoad generates compiler errors when loading from 
+          pointer-to-const)
+        - Fix for Git Issue 29 (DeviceRadixSort::SortKeys<bool> yields compiler 
+          errors)
+        - Fix for Git Issue 26 (CUDA error: misaligned address after 
+          cub::DeviceRadixSort::SortKeys())
+        - Fix for incorrect/crash on 0-length problems, e.g., Git Issue 25 (Floating 
+          point exception (core dumped) during cub::DeviceRadixSort::SortKeys)
+        - Fix for CUDA 7.5 issues on SM 5.2 with SHFL-based warp-scan and warp-reduction 
+          on non-primitive data types (e.g., user-defined structs)
+        - Fix for small radix sorting problems where 0 temporary bytes were 
+          required and the user's code was invoking malloc(0) on some systems where
+          that returns NULL.  (The implementation assumed the caller was asking for 
+          the size again and was not running the sort.)
+          
+//-----------------------------------------------------------------------------
+
+1.4.1    04/13/2015
+    - Bug fixes: 
+        - Fixes for CUDA 7.0 issues with SHFL-based warp-scan and warp-reduction 
+          on non-primitive data types (e.g., user-defined structs)
+        - Fixes for minor CUDA 7.0 performance regressions in cub::DeviceScan,
+          DeviceReduceByKey
+        - Fixes to allow cub::DeviceRadixSort and cub::BlockRadixSort on bool types
+        - Remove requirement for callers to define the CUB_CDP macro 
+          when invoking CUB device-wide routines using CUDA dynamic parallelism
+        - Fix for headers not being included in the proper order (or missing includes)
+          for some block-wide functions
+          
+//-----------------------------------------------------------------------------
+
+1.4.0    03/18/2015
+    - New Features:
+		- Support and performance tuning for new Maxwell GPU architectures
+        - Updated cub::DeviceHistogram implementation that provides the same 
+          "histogram-even" and "histogram-range" functionality as IPP/NPP.
+          Provides extremely fast and, perhaps more importantly, very 
+          uniform performance response across diverse real-world datasets, 
+          including pathological (homogeneous) sample distributions (resilience)
+        - New cub::DeviceSpmv methods for multiplying sparse matrices by 
+          dense vectors, load-balanced using a merge-based parallel decomposition.
+        - New cub::DeviceRadixSort sorting entry-points that always return
+          the sorted output into the specified buffer (as opposed to the 
+          cub::DoubleBuffer in which it could end up in either buffer)
+        - New cub::DeviceRunLengthEncode::NonTrivialRuns for finding the starting 
+          offsets and lengths of all non-trivial runs (i.e., length > 1) of keys in 
+          a given sequence.  (Useful for top-down partitioning algorithms like 
+          MSD sorting of very-large keys.)
+          
+//-----------------------------------------------------------------------------
+
+1.3.2    07/28/2014
+    - Bug fixes: 
+        - Fix for cub::DeviceReduce where reductions of small problems 
+          (small enough to only dispatch a single thread block) would run in 
+          the default stream (stream zero) regardless of whether an alternate
+          stream was specified.  
+          
+//-----------------------------------------------------------------------------
+
+1.3.1    05/23/2014
+    - Bug fixes: 
+        - Workaround for a benign WAW race warning reported by cuda-memcheck
+          in BlockScan specialized for BLOCK_SCAN_WARP_SCANS algorithm.
+        - Fix for bug in DeviceRadixSort where the algorithm may sort more 
+          key bits than the caller specified (up to the nearest radix digit).
+        - Fix for ~3% DeviceRadixSort performance regression on Kepler and 
+          Fermi that was introduced in v1.3.0.  
+
+//-----------------------------------------------------------------------------
+
+1.3.0    05/12/2014
+    - New features:
+        - CUB's collective (block-wide, warp-wide) primitives underwent a minor 
+          interface refactoring:
+            - To provide the appropriate support for multidimensional thread blocks,
+              The interfaces for collective classes are now template-parameterized 
+              by X, Y, and Z block dimensions (with BLOCK_DIM_Y and BLOCK_DIM_Z being 
+              optional, and BLOCK_DIM_X replacing BLOCK_THREADS).  Furthermore, the 
+              constructors that accept remapped linear thread-identifiers have been 
+              removed: all primitives now assume a row-major thread-ranking for 
+              multidimensional thread blocks.  
+            - To allow the host program (compiled by the host-pass) to 
+              accurately determine the device-specific storage requirements for 
+              a given collective (compiled for each device-pass), the interfaces 
+              for collective classes are now (optionally) template-parameterized 
+              by the desired PTX compute capability. This is useful when 
+              aliasing collective storage to shared memory that has been 
+              allocated dynamically by the host at the kernel call site.   
+            - Most CUB programs having typical 1D usage should not require any 
+              changes to accommodate these updates.
+        - Added new "combination" WarpScan methods for efficiently computing 
+          both inclusive and exclusive prefix scans (and sums).
+    - Bug fixes: 
+        - Fixed bug in cub::WarpScan (which affected cub::BlockScan and 
+          cub::DeviceScan) where incorrect results (e.g., NAN) would often be 
+          returned when parameterized for floating-point types (fp32, fp64).
+        - Workaround-fix for ptxas error when compiling with -G flag on Linux 
+          (for debug instrumentation) 
+        - Misc. workaround-fixes for certain scan scenarios (using custom 
+          scan operators) where code compiled for SM1x is run on newer 
+          GPUs of higher compute-capability: the compiler could not tell
+          which memory space was being used by collective operations and was 
+          mistakenly using global ops instead of shared ops. 
+
+//-----------------------------------------------------------------------------
+
+1.2.3    04/01/2014
+    - Bug fixes: 
+        - Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types
+        - Fixed code-snippet bug in ArgIndexInputIteratorT documentation 
+
+//-----------------------------------------------------------------------------
+
+1.2.2    03/03/2014
+    - New features:
+        - Added MS VC++ project solutions for device-wide and block-wide examples 
+    - Performance:
+        - Added a third algorithmic variant of cub::BlockReduce for improved performance
+          when using commutative operators (e.g., numeric addition)
+    - Bug fixes: 
+        - Fixed bug where inclusion of Thrust headers in a certain order prevented CUB device-wide primitives from working properly
+
+//-----------------------------------------------------------------------------
+
+1.2.0    02/25/2014
+    - New features:
+        - Added device-wide reduce-by-key (DeviceReduce::ReduceByKey, DeviceReduce::RunLengthEncode) 
+    - Performance
+        - Improved DeviceScan, DeviceSelect, DevicePartition performance
+    - Documentation and testing:
+        - Compatible with CUDA 6.0
+        - Added performance-portability plots for many device-wide primitives to doc 
+        - Update doc and tests to reflect iterator (in)compatibilities with CUDA 5.0 (and older) and Thrust 1.6 (and older).
+    - Bug fixes 
+        - Revised the operation of temporary tile status bookkeeping for DeviceScan (and similar) to be safe for current code run on future platforms (now uses proper fences)  
+        - Fixed DeviceScan bug where Win32 alignment disagreements between host and device regarding user-defined data types would corrupt tile status
+        - Fixed BlockScan bug where certain exclusive scans on custom data types for the BLOCK_SCAN_WARP_SCANS variant would return incorrect results for the first thread in the block
+        - Added workaround for TexRefInputIteratorT to work with CUDA 6.0
+    
+//-----------------------------------------------------------------------------
+
+1.1.1    12/11/2013
+    - New features:
+        - Added TexObjInputIteratorT, TexRefInputIteratorT, CacheModifiedInputIteratorT, and CacheModifiedOutputIterator types for loading & storing arbitrary types through the cache hierarchy.  Compatible with Thrust API. 
+        - Added descending sorting to DeviceRadixSort and BlockRadixSort
+        - Added min, max, arg-min, and arg-max to DeviceReduce
+        - Added DeviceSelect (select-unique, select-if, and select-flagged)
+        - Added DevicePartition (partition-if, partition-flagged)
+        - Added generic cub::ShuffleUp(), cub::ShuffleDown(), and cub::ShuffleIndex() for warp-wide communication of arbitrary data types (SM3x+)
+        - Added cub::MaxSmOccupancy() for accurately determining SM occupancy for any given kernel function pointer
+    - Performance
+        - Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
+    - Interface changes:
+        - Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces.  The CacheModifiedInputIteratorT and CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
+        - Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
+    - Documentation and testing:
+        - Added simple examples of device-wide methods
+        - Improved doxygen documentation and example snippets
+        - Improved test coverage to include up to 21,000 kernel variants and 851,000 unit tests (per architecture, per platform)
+    - Bug fixes 
+        - Fixed misc DeviceScan, BlockScan, DeviceReduce, and BlockReduce bugs when operating on non-primitive types for older architectures SM10-SM13
+        - Fixed DeviceScan / WarpReduction bug: SHFL-based segmented reduction producing incorrect results for multi-word types (size > 4B) on Linux 
+        - Fixed BlockScan bug: For warpscan-based scans, not all threads in the first warp were entering the prefix callback functor
+        - Fixed DeviceRadixSort bug: race condition with key-value pairs for pre-SM35 architectures
+        - Fixed DeviceRadixSort bug: incorrect bitfield-extract behavior with long keys on 64bit Linux
+        - Fixed BlockDiscontinuity bug: compilation error for types other than int32/uint32
+        - CDP (device-callable) versions of device-wide methods now report the same temporary storage allocation size requirement as their host-callable counterparts
+     
+
+//-----------------------------------------------------------------------------
+
+1.0.2    08/23/2013
+    - Corrections to code snippet examples for BlockLoad, BlockStore, and BlockDiscontinuity
+    - Cleaned up unnecessary/missing header includes.  You can now safely #include a specific .cuh (instead of cub.cuh)
+    - Bug/compilation fixes for BlockHistogram 
+
+//-----------------------------------------------------------------------------
+
+1.0.1    08/08/2013
+    - New collective interface idiom (specialize::construct::invoke).
+    - Added best-in-class DeviceRadixSort.  Implements short-circuiting for homogenous digit passes.
+    - Added best-in-class DeviceScan.  Implements single-pass "adaptive-lookback" strategy.
+    - Significantly improved documentation (with example code snippets) 
+    - More extensive regression test suite for aggressively testing collective variants
+    - Allow non-trivially-constructed types (previously unions had prevented aliasing temporary storage of those types)
+    - Improved support for Kepler SHFL (collective ops now use SHFL for types larger than 32b)
+    - Better code generation for 64-bit addressing within BlockLoad/BlockStore
+    - DeviceHistogram now supports histograms of arbitrary bins
+    - Misc. fixes
+      - Workarounds for SM10 codegen issues in uncommonly-used WarpScan/Reduce specializations
+      - Updates to accommodate CUDA 5.5 dynamic parallelism   
+
+
+//-----------------------------------------------------------------------------
+
+0.9.4    05/07/2013
+
+    - Fixed compilation errors for SM10-SM13
+    - Fixed compilation errors for some WarpScan entrypoints on SM30+
+    - Added block-wide histogram (BlockHistogram256)
+    - Added device-wide histogram (DeviceHistogram256)
+    - Added new BlockScan algorithm variant BLOCK_SCAN_RAKING_MEMOIZE (which 
+      trades more register consumption for less shared memory I/O)
+    - Updates to BlockRadixRank to use BlockScan (which improves performance
+      on Kepler due to SHFL instruction)
+    - Allow types other than C++ primitives to be used in WarpScan::*Sum methods 
+      if they only have operator + overloaded.  (Previously they also required 
+      to support assignment from int(0).) 
+    - Update BlockReduce's BLOCK_REDUCE_WARP_REDUCTIONS algorithm to work even 
+      when block size is not an even multiple of warp size
+    - Added work management utility descriptors (GridQueue, GridEvenShare)
+    - Refactoring of DeviceAllocator interface and CachingDeviceAllocator 
+      implementation 
+    - Misc. documentation updates and corrections. 
+     
+//-----------------------------------------------------------------------------
+
+0.9.2    04/04/2013
+
+    - Added WarpReduce.  WarpReduce uses the SHFL instruction when applicable. 
+      BlockReduce now uses this WarpReduce instead of implementing its own.
+    - Misc. fixes for 64-bit Linux compilation warnings and errors.
+    - Misc. documentation updates and corrections. 
+
+//-----------------------------------------------------------------------------
+
+0.9.1    03/09/2013
+
+    - Fix for ambiguity in BlockScan::Reduce() between generic reduction and 
+      summation.  Summation entrypoints are now called ::Sum(), similar to the 
+      convention in BlockScan.
+    - Small edits to mainpage documentation and download tracking
+    
+//-----------------------------------------------------------------------------
+
+0.9.0    03/07/2013    
+
+    - Initial "preview" release.    CUB is the first durable, high-performance library 
+      of cooperative block-level, warp-level, and thread-level primitives for CUDA 
+      kernel programming.  More primitives and examples coming soon!
+    
\ No newline at end of file
diff --git a/external/cub/LICENSE.TXT b/external/cub/LICENSE.TXT
new file mode 100644
index 00000000000..db122453f9c
--- /dev/null
+++ b/external/cub/LICENSE.TXT
@@ -0,0 +1,24 @@
+Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+   *  Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+   *  Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+   *  Neither the name of the NVIDIA CORPORATION nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/external/cub/README.md b/external/cub/README.md
new file mode 100644
index 00000000000..c107d673d59
--- /dev/null
+++ b/external/cub/README.md
@@ -0,0 +1,128 @@
+<hr>
+<h3>About CUB</h3>
+
+Current release: v1.7.4 (09/20/2017)
+
+We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.
+
+CUB provides state-of-the-art, reusable software components for every layer 
+of the CUDA programming model:
+- [<b><em>Device-wide primitives</em></b>](https://nvlabs.github.com/cub/group___device_module.html) 
+  - Sort, prefix scan, reduction, histogram, etc.  
+  - Compatible with CUDA dynamic parallelism
+- [<b><em>Block-wide "collective" primitives</em></b>](https://nvlabs.github.com/cub/group___block_module.html)
+  - I/O, sort, prefix scan, reduction, histogram, etc.  
+  - Compatible with arbitrary thread block sizes and types 
+- [<b><em>Warp-wide "collective" primitives</em></b>](https://nvlabs.github.com/cub/group___warp_module.html)
+  - Warp-wide prefix scan, reduction, etc.
+  - Safe and architecture-specific
+- [<b><em>Thread and resource utilities</em></b>](https://nvlabs.github.com/cub/group___thread_module.html)
+  - PTX intrinsics, device reflection, texture-caching iterators, caching memory allocators, etc. 
+
+![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.com/cub/cub_overview.png)
+
+<br><hr>
+<h3>A Simple Example</h3>
+
+```C++
+#include <cub/cub.cuh>
+ 
+// Block-sorting CUDA kernel
+__global__ void BlockSortKernel(int *d_in, int *d_out)
+{
+     using namespace cub;
+
+     // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads 
+     // owning 16 integer items each
+     typedef BlockRadixSort<int, 128, 16>                     BlockRadixSort;
+     typedef BlockLoad<int, 128, 16, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     typedef BlockStore<int, 128, 16, BLOCK_STORE_TRANSPOSE> BlockStore;
+ 
+     // Allocate shared memory
+     __shared__ union {
+         typename BlockRadixSort::TempStorage  sort;
+         typename BlockLoad::TempStorage       load; 
+         typename BlockStore::TempStorage      store; 
+     } temp_storage; 
+
+     int block_offset = blockIdx.x * (128 * 16);	  // Offset for this block's segment
+
+     // Obtain a segment of 2048 consecutive keys that are blocked across threads
+     int thread_keys[16];
+     BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys);
+     __syncthreads();
+
+     // Collectively sort the keys
+     BlockRadixSort(temp_storage.sort).Sort(thread_keys);
+     __syncthreads();
+
+     // Store the sorted segment 
+     BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys);
+}
+```
+
+Each thread block uses cub::BlockRadixSort to collectively sort 
+its own input segment.  The class is specialized by the 
+data type being sorted, by the number of threads per block, by the number of 
+keys per thread, and implicitly by the targeted compilation architecture.  
+
+The cub::BlockLoad and cub::BlockStore classes are similarly specialized.    
+Furthermore, to provide coalesced accesses to device memory, these primitives are 
+configured to access memory using a striped access pattern (where consecutive threads 
+simultaneously access consecutive items) and then <em>transpose</em> the keys into 
+a [<em>blocked arrangement</em>](index.html#sec4sec3) of elements across threads. 
+
+Once specialized, these classes expose opaque `TempStorage` member types.  
+The thread block uses these storage types to statically allocate the union of 
+shared memory needed by the thread block.  (Alternatively these storage types 
+could be aliased to global memory allocations).
+
+<br><hr>
+<h3>Stable Releases</h3>
+
+CUB releases are labeled using version identifiers having three fields: 
+*epoch.feature.update*.  The *epoch* field corresponds to support for
+a major change in the CUDA programming model.  The *feature* field 
+corresponds to a stable set of features, functionality, and interface.  The
+*update* field corresponds to a bug-fix or performance update for that
+feature set.  At the moment, we do not publicly provide non-stable releases 
+such as development snapshots, beta releases or rolling releases.  (Feel free
+to contact us if you would like such things.)  See the 
+[CUB Project Website](http://nvlabs.github.com/cub) for more information.
+
+<br><hr>
+<h3>Contributors</h3>
+
+CUB is developed as an open-source project by [NVIDIA Research](http://research.nvidia.com).  The primary contributor is [Duane Merrill](http://github.com/dumerrill).
+
+<br><hr>
+<h3>Open Source License</h3>
+
+CUB is available under the "New BSD" open-source license:
+
+```
+Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+   *  Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+   *  Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+   *  Neither the name of the NVIDIA CORPORATION nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
diff --git a/external/cub/common.mk b/external/cub/common.mk
new file mode 100644
index 00000000000..71d9880c5f5
--- /dev/null
+++ b/external/cub/common.mk
@@ -0,0 +1,233 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+
+#-------------------------------------------------------------------------------
+# Commandline Options
+#-------------------------------------------------------------------------------
+
+# [sm=<XXX,...>] Compute-capability to compile for, e.g., "sm=200,300,350" (SM20 by default).
+  
+COMMA = ,
+ifdef sm
+	SM_ARCH = $(subst $(COMMA),-,$(sm))
+else 
+    SM_ARCH = 200
+endif
+
+ifeq (700, $(findstring 700, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_70,code=\"sm_70,compute_70\" 
+    SM_DEF 		+= -DSM700
+    TEST_ARCH 	= 700
+endif
+ifeq (620, $(findstring 620, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_62,code=\"sm_62,compute_62\" 
+    SM_DEF 		+= -DSM620
+    TEST_ARCH 	= 620
+endif
+ifeq (610, $(findstring 610, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_61,code=\"sm_61,compute_61\" 
+    SM_DEF 		+= -DSM610
+    TEST_ARCH 	= 610
+endif
+ifeq (600, $(findstring 600, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_60,code=\"sm_60,compute_60\" 
+    SM_DEF 		+= -DSM600
+    TEST_ARCH 	= 600
+endif
+ifeq (520, $(findstring 520, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_52,code=\"sm_52,compute_52\" 
+    SM_DEF 		+= -DSM520
+    TEST_ARCH 	= 520
+endif
+ifeq (370, $(findstring 370, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_37,code=\"sm_37,compute_37\" 
+    SM_DEF 		+= -DSM370
+    TEST_ARCH 	= 370
+endif
+ifeq (350, $(findstring 350, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_35,code=\"sm_35,compute_35\" 
+    SM_DEF 		+= -DSM350
+    TEST_ARCH 	= 350
+endif
+ifeq (300, $(findstring 300, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_30,code=\"sm_30,compute_30\"
+    SM_DEF 		+= -DSM300
+    TEST_ARCH 	= 300
+endif
+ifeq (210, $(findstring 210, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_20,code=\"sm_21,compute_20\"
+    SM_DEF 		+= -DSM210
+    TEST_ARCH 	= 210
+endif
+ifeq (200, $(findstring 200, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_20,code=\"sm_20,compute_20\"
+    SM_DEF 		+= -DSM200
+    TEST_ARCH 	= 200
+endif
+ifeq (130, $(findstring 130, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_13,code=\"sm_13,compute_13\" 
+    SM_DEF 		+= -DSM130
+    TEST_ARCH 	= 130
+endif
+ifeq (120, $(findstring 120, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_12,code=\"sm_12,compute_12\" 
+    SM_DEF 		+= -DSM120
+    TEST_ARCH 	= 120
+endif
+ifeq (110, $(findstring 110, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_11,code=\"sm_11,compute_11\" 
+    SM_DEF 		+= -DSM110
+    TEST_ARCH 	= 110
+endif
+ifeq (100, $(findstring 100, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_10,code=\"sm_10,compute_10\" 
+    SM_DEF 		+= -DSM100
+    TEST_ARCH 	= 100
+endif
+
+
+# [cdp=<0|1>] CDP enable option (default: no)
+ifeq ($(cdp), 1)
+	DEFINES += -DCUB_CDP
+	CDP_SUFFIX = cdp
+    NVCCFLAGS += -rdc=true -lcudadevrt
+else
+	CDP_SUFFIX = nocdp
+endif
+
+
+# [force32=<0|1>] Device addressing mode option (64-bit device pointers by default) 
+ifeq ($(force32), 1)
+	CPU_ARCH = -m32
+	CPU_ARCH_SUFFIX = i386
+else
+	CPU_ARCH = -m64
+	CPU_ARCH_SUFFIX = x86_64
+    NPPI = -lnppist
+endif
+
+
+# [abi=<0|1>] CUDA ABI option (enabled by default) 
+ifneq ($(abi), 0)
+	ABI_SUFFIX = abi
+else 
+	NVCCFLAGS += -Xptxas -abi=no
+	ABI_SUFFIX = noabi
+endif
+
+
+# [open64=<0|1>] Middle-end compiler option (nvvm by default)
+ifeq ($(open64), 1)
+	NVCCFLAGS += -open64
+	PTX_SUFFIX = open64
+else 
+	PTX_SUFFIX = nvvm
+endif
+
+
+# [verbose=<0|1>] Verbose toolchain output from nvcc option
+ifeq ($(verbose), 1)
+	NVCCFLAGS += -v
+endif
+
+
+# [keep=<0|1>] Keep intermediate compilation artifacts option
+ifeq ($(keep), 1)
+	NVCCFLAGS += -keep
+endif
+
+# [debug=<0|1>] Generate debug mode code
+ifeq ($(debug), 1)
+	NVCCFLAGS += -G
+endif
+
+
+#-------------------------------------------------------------------------------
+# Compiler and compilation platform
+#-------------------------------------------------------------------------------
+
+CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST)))
+
+NVCC = "$(shell which nvcc)"
+ifdef nvccver
+    NVCC_VERSION = $(nvccver)
+else
+    NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' |  sed 's/,.*//'))
+endif
+
+# Detect OS: upper-cased `uname -s`; the tr character classes are quoted so the shell cannot glob-expand them
+OSUPPER = $(shell uname -s 2>/dev/null | tr '[:lower:]' '[:upper:]')
+
+# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases 
+NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\# 
+
+ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
+    # For MSVC
+    # Enable more warnings and treat as errors
+    NVCCFLAGS += -Xcompiler /W3 -Xcompiler /WX
+    # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
+    NVCCFLAGS += -Xcompiler /fp:strict
+    # Help the compiler/linker work with huge numbers of kernels on Windows
+	NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
+	CC = cl
+	
+	# Multithreaded runtime
+	NVCCFLAGS += -Xcompiler /MT
+    # CUDA runtime import library matching the host word size: Win32 when force32=1 (-m32), x64 otherwise (-m64)
+ifeq ($(force32), 1)
+	CUDART_CYG = "$(shell dirname $(NVCC))/../lib/Win32/cudart.lib"
+else
+	CUDART_CYG = "$(shell dirname $(NVCC))/../lib/x64/cudart.lib"
+endif
+	CUDART = "$(shell cygpath -w $(CUDART_CYG))"
+else
+    # For g++ (static CUDA runtime is taken from lib/ when force32=1, from lib64/ otherwise)
+    # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
+    NVCCFLAGS += -Xcompiler -ffloat-store
+    CC = g++
+ifeq ($(force32), 1)
+    CUDART = "$(shell dirname $(NVCC))/../lib/libcudart_static.a"
+else
+    CUDART = "$(shell dirname $(NVCC))/../lib64/libcudart_static.a"
+endif
+endif
+
+# Suffix to append to each binary
+BIN_SUFFIX = sm$(SM_ARCH)_$(PTX_SUFFIX)_$(NVCC_VERSION)_$(ABI_SUFFIX)_$(CDP_SUFFIX)_$(CPU_ARCH_SUFFIX)
+
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+CUB_DEPS = 	$(call rwildcard, $(CUB_DIR),*.cuh) \
+			$(CUB_DIR)common.mk
+		
diff --git a/external/cub/cub/agent/agent_histogram.cuh b/external/cub/cub/agent/agent_histogram.cuh
new file mode 100644
index 00000000000..3b6cc4c92bc
--- /dev/null
+++ b/external/cub/cub/agent/agent_histogram.cuh
@@ -0,0 +1,787 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../util_type.cuh"
+#include "../block/block_load.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ * Where a thread block prefers to keep its privatized histogram bin counters
+ */
+enum BlockHistogramMemoryPreference
+{
+    GMEM,       ///< Privatized counters in global memory
+    SMEM,       ///< Privatized counters in shared memory
+    BLEND       ///< Mixed: AgentHistogram chooses shared or global per thread block from the parity of blockIdx.x
+};
+
+
+/**
+ * Parameterizable tuning policy type for AgentHistogram: republishes its template arguments as nested compile-time constants
+ */
+template <
+    int                             _BLOCK_THREADS,                 ///< Threads per thread block
+    int                             _PIXELS_PER_THREAD,             ///< Pixels per thread (per tile of input)
+    BlockLoadAlgorithm              _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier               _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    bool                            _RLE_COMPRESS,                  ///< Whether to perform localized RLE to compress samples before histogramming
+    BlockHistogramMemoryPreference  _MEM_PREFERENCE,                ///< Preferred location of the privatized bins (GMEM, SMEM, or BLEND)
+    bool                            _WORK_STEALING>                 ///< Whether to dequeue tiles from a global work queue
+struct AgentHistogramPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,                   ///< Threads per thread block
+        PIXELS_PER_THREAD       = _PIXELS_PER_THREAD,               ///< Pixels per thread (per tile of input)
+        IS_RLE_COMPRESS         = _RLE_COMPRESS,                    ///< Whether to perform localized RLE to compress samples before histogramming
+        MEM_PREFERENCE          = _MEM_PREFERENCE,                  ///< Preferred location of the privatized bins (GMEM, SMEM, or BLEND)
+        IS_WORK_STEALING        = _WORK_STEALING,                   ///< Whether to dequeue tiles from a global work queue
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
+ */
+template <
+    typename    AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
+    int         PRIVATIZED_SMEM_BINS,           ///< Number of privatized shared-memory histogram bins of any channel.  Zero indicates privatized counters to be maintained in device-accessible memory.
+    int         NUM_CHANNELS,                   ///< Number of channels interleaved in the input data.  Supports up to four channels.
+    int         NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename    SampleIteratorT,                ///< Random-access input iterator type for reading samples
+    typename    CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
+    typename    PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
+    typename    OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    int         PTX_ARCH = CUB_PTX_ARCH>        ///< PTX compute capability
+struct AgentHistogram
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The sample type of the input iterator
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    /// The pixel type of SampleT
+    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;
+
+    /// The quad type of SampleT
+    typedef typename CubVector<SampleT, 4>::Type QuadT;
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS           = AgentHistogramPolicyT::BLOCK_THREADS,
+
+        PIXELS_PER_THREAD       = AgentHistogramPolicyT::PIXELS_PER_THREAD,
+        SAMPLES_PER_THREAD      = PIXELS_PER_THREAD * NUM_CHANNELS,
+        QUADS_PER_THREAD        = SAMPLES_PER_THREAD / 4,
+
+        TILE_PIXELS             = PIXELS_PER_THREAD * BLOCK_THREADS,
+        TILE_SAMPLES            = SAMPLES_PER_THREAD * BLOCK_THREADS,
+
+        IS_RLE_COMPRESS            = AgentHistogramPolicyT::IS_RLE_COMPRESS,
+
+        MEM_PREFERENCE          = (PRIVATIZED_SMEM_BINS > 0) ?
+                                        AgentHistogramPolicyT::MEM_PREFERENCE :
+                                        GMEM,
+
+        IS_WORK_STEALING           = AgentHistogramPolicyT::IS_WORK_STEALING,
+    };
+
+    /// Cache load modifier for reading input elements
+    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
+
+
+    /// Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
+            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
+            SampleIteratorT>::Type                                           // Directly use the supplied input iterator type
+        WrappedSampleIteratorT;
+
+    /// Pixel input iterator type (for applying cache modifier)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
+        WrappedPixelIteratorT;
+
+    /// Quad input iterator type (for applying cache modifier)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
+        WrappedQuadIteratorT;
+
+    /// Parameterized BlockLoad type for samples
+    typedef BlockLoad<
+            SampleT,
+            BLOCK_THREADS,
+            SAMPLES_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadSampleT;
+
+    /// Parameterized BlockLoad type for pixels
+    typedef BlockLoad<
+            PixelT,
+            BLOCK_THREADS,
+            PIXELS_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadPixelT;
+
+    /// Parameterized BlockLoad type for quads
+    typedef BlockLoad<
+            QuadT,
+            BLOCK_THREADS,
+            QUADS_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadQuadT;
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];     // Smem needed for block-privatized smem histogram (one extra counter of padding per channel)
+
+        int tile_idx;                                                           // Next tile index: written by thread 0 and read by every thread in the work-stealing ConsumeTiles
+
+        // Aliasable storage layout: the three tile loaders share the same bytes
+        union Aliasable
+        {
+            typename BlockLoadSampleT::TempStorage sample_load;     // Smem needed for loading a tile of samples
+            typename BlockLoadPixelT::TempStorage pixel_load;       // Smem needed for loading a tile of pixels
+            typename BlockLoadQuadT::TempStorage quad_load;         // Smem needed for loading a tile of quads
+
+        } aliasable;
+    };
+
+
+    /// Temporary storage type (unionable)
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to temp_storage
+    _TempStorage &temp_storage;
+
+    /// Sample input iterator (with cache modifier applied, if possible)
+    WrappedSampleIteratorT d_wrapped_samples;
+
+    /// Native pointer for input samples (possibly NULL if unavailable)
+    SampleT* d_native_samples;
+
+    /// The number of output bins for each channel
+    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];
+
+    /// The number of privatized bins for each channel
+    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];
+
+    /// Pointers to this thread block's gmem privatized histograms, one per channel
+    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+    /// Reference to final output histograms (gmem)
+    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];
+
+    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];
+
+    /// The transform operator for determining privatized counter indices from samples, one for each channel
+    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
+
+    /// Whether to prefer privatized smem counters vs privatized global counters
+    bool prefer_smem;
+
+
+    //---------------------------------------------------------------------
+    // Initialize privatized bin counters
+    //---------------------------------------------------------------------
+
+    // Zero the leading num_privatized_bins[CHANNEL] counters of each channel's privatized histogram
+    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+    {
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            // Threads sweep this channel's bins BLOCK_THREADS apart, starting at their own thread index
+            CounterT* channel_counters = privatized_histograms[CHANNEL];
+
+            for (int bin = threadIdx.x; bin < num_privatized_bins[CHANNEL]; bin += BLOCK_THREADS)
+                channel_counters[bin] = 0;
+        }
+
+        // Block-wide barrier: no thread proceeds until every counter has been cleared
+        CTA_SYNC();
+    }
+
+
+    // Zero the shared-memory privatized histograms held in temp_storage
+    __device__ __forceinline__ void InitSmemBinCounters()
+    {
+        // Build a per-channel pointer table over the smem histogram rows
+        CounterT* smem_histograms[NUM_ACTIVE_CHANNELS];
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            smem_histograms[CHANNEL] = &temp_storage.histograms[CHANNEL][0];
+
+        InitBinCounters(smem_histograms);
+    }
+
+
+    // Zero this thread block's global-memory privatized counters (the per-channel pointers in d_privatized_histograms)
+    __device__ __forceinline__ void InitGmemBinCounters()
+    {
+        InitBinCounters(d_privatized_histograms);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Update final output histograms
+    //---------------------------------------------------------------------
+
+    // Fold the block's privatized bin counts into the final output histograms using atomic adds
+    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+    {
+        // Barrier to make sure all threads are done updating counters
+        CTA_SYNC();
+
+        // Apply privatized bin counts to output bin counts
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            int channel_bins = num_privatized_bins[CHANNEL];
+            for (int privatized_bin = threadIdx.x; 
+                    privatized_bin < channel_bins;  
+                    privatized_bin += BLOCK_THREADS)
+            {
+                int         output_bin  = -1;                                               // -1 = no output bin; only non-negative results are stored below
+                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
+                bool        is_valid    = count > 0;                                        // Empty privatized bins are passed to the decode op as invalid
+
+                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);
+
+                if (output_bin >= 0)
+                {
+                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
+                }
+
+            }
+        }
+    }
+
+
+    // Flush the shared-memory privatized histograms in temp_storage to the final output histograms
+    __device__ __forceinline__ void StoreSmemOutput()
+    {
+        CounterT* smem_histograms[NUM_ACTIVE_CHANNELS];
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            smem_histograms[CHANNEL] = &temp_storage.histograms[CHANNEL][0];
+
+        StoreOutput(smem_histograms);
+    }
+
+
+    // Flush this thread block's global-memory privatized counters (d_privatized_histograms) to the final output histograms
+    __device__ __forceinline__ void StoreGmemOutput()
+    {
+        StoreOutput(d_privatized_histograms);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Tile accumulation
+    //---------------------------------------------------------------------
+
+    // Accumulate pixels.  Specialized for RLE compression: a run of consecutive pixels falling in the same bin is added with a single atomic.
+    __device__ __forceinline__ void AccumulatePixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD],
+        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
+        Int2Type<true>      is_rle_compress)
+    {
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            // Privatized bin for each of this thread's pixels (initialized to -1; negative entries are never accumulated)
+            int bins[PIXELS_PER_THREAD];
+
+            #pragma unroll
+            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+            {
+                bins[PIXEL] = -1;
+                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
+            }
+
+            CounterT accumulator = 1;       // Length of the run ending at the current pixel
+
+            #pragma unroll
+            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
+            {
+                if (bins[PIXEL] != bins[PIXEL + 1])     // Run ends at PIXEL: flush it, then restart the count
+                {
+                    if (bins[PIXEL] >= 0)
+                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);
+
+                     accumulator = 0;
+                }
+                accumulator++;
+            }
+
+            // Flush the final run, which ends at the last pixel
+            if (bins[PIXELS_PER_THREAD - 1] >= 0)
+                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
+        }
+    }
+
+
+    // Accumulate pixels one at a time: every pixel issues its own atomic increment per active channel (no run-length merging).
+    __device__ __forceinline__ void AccumulatePixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD],
+        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
+        Int2Type<false>     is_rle_compress)
+    {
+        #pragma unroll
+        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+        {
+            #pragma unroll
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            {
+                int privatized_bin = -1;    // negative means "do not count this sample"
+                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], privatized_bin, is_valid[PIXEL]);
+                if (privatized_bin >= 0)
+                    atomicAdd(&privatized_histograms[CHANNEL][privatized_bin], 1);
+            }
+        }
+    }
+
+
+    /**
+     * Accumulate this thread's pixels into the shared-memory privatized histograms in temp_storage
+     */
+    __device__ __forceinline__ void AccumulateSmemPixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD])
+    {
+        // Per-channel pointers to the smem histogram rows
+        CounterT* smem_histograms[NUM_ACTIVE_CHANNELS];
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            smem_histograms[CHANNEL] = &temp_storage.histograms[CHANNEL][0];
+
+        AccumulatePixels(samples, is_valid, smem_histograms, Int2Type<IS_RLE_COMPRESS>());
+    }
+
+
+    /**
+     * Accumulate this thread's pixels into this block's global-memory privatized histograms (d_privatized_histograms), with or without RLE per IS_RLE_COMPRESS
+     */
+    __device__ __forceinline__ void AccumulateGmemPixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD])
+    {
+        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Tile loading
+    //---------------------------------------------------------------------
+
+    // Load full, aligned tile using pixel iterator (multi-channel).  valid_samples is not read; the Int2Type argument only selects this overload.
+    template <int _NUM_ACTIVE_CHANNELS>
+    __device__ __forceinline__ void LoadFullAlignedTile(
+        OffsetT                         block_offset,
+        int                             valid_samples,
+        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
+    {
+        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];       // The samples array viewed as PIXELS_PER_THREAD whole pixels
+
+        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+
+        // Load using a wrapped pixel iterator, writing through the pixel-typed view of samples
+        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
+            d_wrapped_pixels,
+            reinterpret_cast<AliasedPixels&>(samples));
+    }
+
+    // Load full, aligned tile using quad iterator (single-channel).  valid_samples is not read; Int2Type<1> only selects this overload.
+    __device__ __forceinline__ void LoadFullAlignedTile(
+        OffsetT                         block_offset,
+        int                             valid_samples,
+        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<1>                     num_active_channels)
+    {
+        typedef QuadT AliasedQuads[QUADS_PER_THREAD];          // The samples array viewed as QUADS_PER_THREAD four-sample vectors
+
+        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));
+
+        // Load using a wrapped quad iterator, writing through the quad-typed view of samples
+        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
+            d_wrapped_quads,
+            reinterpret_cast<AliasedQuads&>(samples));
+    }
+
+    // Load full, aligned tile.  Dispatches on NUM_CHANNELS (not NUM_ACTIVE_CHANNELS): the quad loader is only safe for single-channel input, the one case ConsumeTiles checks quad alignment for; any multi-channel input takes the pixel loader.
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<true>  is_full_tile,
+        Int2Type<true>  is_aligned)
+    {
+        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_CHANNELS>());
+    }
+
+    // Load full, mis-aligned tile using sample iterator (one sample per element; valid_samples is not read)
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<true>  is_full_tile,
+        Int2Type<false> is_aligned)
+    {
+        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];    // The samples array viewed as a flat run of SAMPLES_PER_THREAD samples
+
+        // Load using sample iterator
+        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
+            d_wrapped_samples + block_offset,
+            reinterpret_cast<AliasedSamples&>(samples));
+    }
+
+    // Load partially-full, aligned tile using the pixel iterator, guarded by the number of whole pixels in the tile
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<false> is_full_tile,
+        Int2Type<true>  is_aligned)
+    {
+        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];       // The samples array viewed as PIXELS_PER_THREAD whole pixels
+
+        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+
+        int valid_pixels = valid_samples / NUM_CHANNELS;       // Guard passed to the block load, in pixel units
+
+        // Load using a wrapped pixel iterator
+        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
+            d_wrapped_pixels,
+            reinterpret_cast<AliasedPixels&>(samples),
+            valid_pixels);
+    }
+
+    // Load partially-full, mis-aligned tile using sample iterator, guarded by valid_samples
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<false> is_full_tile,
+        Int2Type<false> is_aligned)
+    {
+        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];    // The samples array viewed as a flat run of SAMPLES_PER_THREAD samples
+
+        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
+            d_wrapped_samples + block_offset,
+            reinterpret_cast<AliasedSamples&>(samples),
+            valid_samples);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Tile processing
+    //---------------------------------------------------------------------
+
+    // Consume a tile of data samples: load it, flag which pixels are valid, then accumulate into the privatized histograms
+    template <
+        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
+        bool IS_FULL_TILE>      // Whether the tile is full
+    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
+    {
+        SampleT     samples[PIXELS_PER_THREAD][NUM_CHANNELS];
+        bool        is_valid[PIXELS_PER_THREAD];
+
+        // Load tile (overload chosen at compile time from IS_FULL_TILE and IS_ALIGNED)
+        LoadTile(
+            block_offset,
+            valid_samples,
+            samples,
+            Int2Type<IS_FULL_TILE>(),
+            Int2Type<IS_ALIGNED>());
+
+        // Set valid flags: in a partial tile a pixel counts only if its first sample index within the tile is below valid_samples
+        #pragma unroll
+        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
+
+        // Accumulate samples (the shared-memory path is compiled in only when CUB_PTX_ARCH >= 120)
+#if CUB_PTX_ARCH >= 120
+        if (prefer_smem)
+            AccumulateSmemPixels(samples, is_valid);
+        else
+            AccumulateGmemPixels(samples, is_valid);
+#else
+        AccumulateGmemPixels(samples, is_valid);
+#endif
+
+    }
+
+
+    // Consume row tiles.  Specialized for work-stealing from queue
+    template <bool IS_ALIGNED>
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue,                 ///< Queue descriptor handing out tile indices beyond each block's first tile
+        Int2Type<true>      is_work_stealing)
+    {
+        // Every block starts on the tile matching its linear block id; all later tiles are drained from the queue
+        int         num_tiles                   = num_rows * tiles_per_row;
+        int         tile_idx                    = (blockIdx.y  * gridDim.x) + blockIdx.x;
+        OffsetT     num_even_share_tiles        = gridDim.x * gridDim.y;
+
+        while (tile_idx < num_tiles)
+        {
+            int     row             = tile_idx / tiles_per_row;
+            int     col             = tile_idx - (row * tiles_per_row);
+            OffsetT row_offset      = row * row_stride_samples;
+            OffsetT col_offset      = OffsetT(col) * TILE_SAMPLES;     // Widen before multiplying: an int product overflows for rows longer than INT_MAX samples
+            OffsetT tile_offset     = row_offset + col_offset;
+
+            if (col == tiles_per_row - 1)
+            {
+                // Consume a partially-full tile at the end of the row
+                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
+                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
+            } 
+            else
+            {
+                // Consume full tile
+                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
+            }
+
+            CTA_SYNC();                                 // All threads are done with this tile before thread 0 overwrites temp_storage.tile_idx
+
+            // Get next tile: queue positions start after the tiles the blocks took as their first tile
+            if (threadIdx.x == 0)
+                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;
+
+            CTA_SYNC();                                 // Make thread 0's tile index visible to the whole block
+
+            tile_idx = temp_storage.tile_idx;
+        }
+    }
+
+
+    // Consume row tiles.  Specialized for even-share: rows are striped over blockIdx.y, each row's tiles over blockIdx.x
+    template <bool IS_ALIGNED>
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue,
+        Int2Type<false>     is_work_stealing)
+    {
+        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
+        {
+            OffsetT row_first   = row * row_stride_samples;
+            OffsetT row_limit   = row_first + (num_row_pixels * NUM_CHANNELS);
+
+            for (OffsetT tile_offset = row_first + (blockIdx.x * TILE_SAMPLES);
+                    tile_offset < row_limit;
+                    tile_offset += gridDim.x * TILE_SAMPLES)
+            {
+                OffsetT samples_left = row_limit - tile_offset;
+                if (samples_left >= TILE_SAMPLES)
+                {
+                    // Whole tile available: consume it, then stride on to this block's next tile in the row
+                    ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
+                    continue;
+                }
+
+                // Ragged tail of the row: consume what is left and finish the row
+                ConsumeTile<IS_ALIGNED, false>(tile_offset, samples_left);
+                break;
+            }
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Parameter extraction
+    //---------------------------------------------------------------------
+
+    // Return the native sample pointer held by a CacheModifiedInputIterator (its ptr member)
+    template <
+        CacheLoadModifier   _MODIFIER,
+        typename            _ValueT,
+        typename            _OffsetT>
+    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
+    {
+        return itr.ptr;
+    }
+
+    // Fallback for any other iterator type: no native sample pointer is available, so return NULL
+    template <typename IteratorT>
+    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
+    {
+        return NULL;
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Constructor.  Member initializers are listed in declaration order (the order C++ runs them in): d_native_samples is derived from the already-initialized d_wrapped_samples.
+     */
+    __device__ __forceinline__ AgentHistogram(
+        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
+        SampleIteratorT     d_samples,                                          ///< Input samples to histogram
+        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number bins per final output histogram
+        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number bins per privatized histogram
+        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
+        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
+        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
+    :
+        temp_storage(temp_storage.Alias()),
+        d_wrapped_samples(d_samples),
+        d_native_samples(NativePointer(d_wrapped_samples)),
+        num_output_bins(num_output_bins),
+        num_privatized_bins(num_privatized_bins),
+        d_output_histograms(d_output_histograms),
+        output_decode_op(output_decode_op),
+        privatized_decode_op(privatized_decode_op),
+        prefer_smem((MEM_PREFERENCE == SMEM) ?
+            true :                              // prefer smem privatized histograms
+            (MEM_PREFERENCE == GMEM) ?
+                false :                         // prefer gmem privatized histograms
+                blockIdx.x & 1)                 // prefer blended privatized histograms
+    {
+        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
+
+        // Point each channel at this block's slice of the privatized histograms: blockId * num_privatized_bins[CHANNEL] counters past the channel base
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
+    }
+
+
+    /**
+     * Consume the image region of interest, choosing at run time between vectorized (aligned) and per-sample tile loads
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue)                 ///< Queue descriptor for assigning tiles of work to thread blocks
+    {
+        // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
+        int     quad_mask           = AlignBytes<QuadT>::ALIGN_BYTES - 1;
+        int     pixel_mask          = AlignBytes<PixelT>::ALIGN_BYTES - 1;
+        size_t  row_bytes           = sizeof(SampleT) * row_stride_samples;
+
+        bool quad_aligned_rows      = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&     // Single channel, and each thread's samples split evenly into quads
+                                        ((size_t(d_native_samples) & quad_mask) == 0) &&        // ptr is quad-aligned
+                                        ((num_rows == 1) || ((row_bytes & quad_mask) == 0));    // single row, or the row stride in bytes is a multiple of the quad alignment
+
+        bool pixel_aligned_rows     = (NUM_CHANNELS > 1) &&                                     // Multi channel
+                                        ((size_t(d_native_samples) & pixel_mask) == 0) &&       // ptr is pixel-aligned
+                                        ((row_bytes & pixel_mask) == 0);                        // the row stride in bytes is a multiple of the pixel alignment
+
+        // Whether rows are aligned and can be vectorized (requires a native pointer); even-share vs work-stealing is picked from IS_WORK_STEALING
+        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
+            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
+        else
+            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
+    }
+
+
+    /**
+     * Zero this block's privatized bin counters, in shared or global memory according to prefer_smem
+     */
+    __device__ __forceinline__ void InitBinCounters()
+    {
+        if (!prefer_smem)
+            InitGmemBinCounters();
+        else
+            InitSmemBinCounters();
+    }
+
+
+    /**
+     * Add this block's privatized histogram (shared or global, according to prefer_smem) into the final output histograms
+     */
+    __device__ __forceinline__ void StoreOutput()
+    {
+        if (!prefer_smem)
+            StoreGmemOutput();
+        else
+            StoreSmemOutput();
+    }
+
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/agent/agent_radix_sort_downsweep.cuh b/external/cub/cub/agent/agent_radix_sort_downsweep.cuh
new file mode 100644
index 00000000000..0eee5f4ebf1
--- /dev/null
+++ b/external/cub/cub/agent/agent_radix_sort_downsweep.cuh
@@ -0,0 +1,772 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
+ */
+
+
+#pragma once
+
+#include <stdint.h>
+
+#include "../thread/thread_load.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_radix_rank.cuh"
+#include "../block/block_exchange.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Radix ranking algorithm selector (AgentRadixSortDownsweep uses it to pick its block-wide ranking type)
+ */
+enum RadixRankAlgorithm
+{
+    RADIX_RANK_BASIC,       ///< Selects BlockRadixRank with its fourth template argument set to false
+    RADIX_RANK_MEMOIZE,     ///< Selects BlockRadixRank with its fourth template argument set to true
+    RADIX_RANK_MATCH        ///< Selects BlockRadixRankMatch
+};
+
+/**
+ * Parameterizable tuning policy type for AgentRadixSortDownsweep.  Carries no state: each template argument is simply re-exported as a nested compile-time constant
+ */
+template <
+    int                         _BLOCK_THREADS,         ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,        ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,         ///< Cache load modifier for reading keys (and values)
+    RadixRankAlgorithm          _RANK_ALGORITHM,        ///< The radix ranking algorithm to use
+    BlockScanAlgorithm          _SCAN_ALGORITHM,        ///< The block scan algorithm to use
+    int                         _RADIX_BITS>            ///< The number of radix bits, i.e., log2(bins)
+struct AgentRadixSortDownsweepPolicy
+{
+    enum    // Integral parameters are re-exported as anonymous-enum constants
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,           ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,        ///< Items per thread (per tile of input)
+        RADIX_BITS              = _RADIX_BITS,              ///< The number of radix bits, i.e., log2(bins)
+    };
+
+    static const BlockLoadAlgorithm  LOAD_ALGORITHM     = _LOAD_ALGORITHM;    ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier   LOAD_MODIFIER      = _LOAD_MODIFIER;     ///< Cache load modifier for reading keys (and values)
+    static const RadixRankAlgorithm  RANK_ALGORITHM     = _RANK_ALGORITHM;    ///< The radix ranking algorithm to use
+    static const BlockScanAlgorithm  SCAN_ALGORITHM     = _SCAN_ALGORITHM;    ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+
+
+
+
+/**
+ * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
+ */
+template <
+    typename AgentRadixSortDownsweepPolicy,     ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
+    bool     IS_DESCENDING,                     ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,                              ///< KeyT type
+    typename ValueT,                            ///< ValueT type
+    typename OffsetT>                           ///< Signed integer type for global offsets
+struct AgentRadixSortDownsweep
+{
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    // Appropriate unsigned-bits representation of KeyT
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+    static const UnsignedBits           LOWEST_KEY  = Traits<KeyT>::LOWEST_KEY;
+    static const UnsignedBits           MAX_KEY     = Traits<KeyT>::MAX_KEY;
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM  = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM;
+    static const CacheLoadModifier      LOAD_MODIFIER   = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER;
+    static const RadixRankAlgorithm     RANK_ALGORITHM  = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM;
+    static const BlockScanAlgorithm     SCAN_ALGORITHM  = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM;
+
+    enum
+    {
+        BLOCK_THREADS           = AgentRadixSortDownsweepPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
+        RADIX_BITS              = AgentRadixSortDownsweepPolicy::RADIX_BITS,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        RADIX_DIGITS            = 1 << RADIX_BITS,
+        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // Input iterator wrapper types (for applying cache modifiers)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT>    KeysItr;
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>          ValuesItr;
+
+    // Radix ranking type to use
+    typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC),
+            BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, false, SCAN_ALGORITHM>,
+            typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
+                BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, true, SCAN_ALGORITHM>,
+                BlockRadixRankMatch<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>
+            >::Type
+        >::Type BlockRadixRankT;
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD
+    };
+
+    // BlockLoad type (keys)
+    typedef BlockLoad<
+        UnsignedBits,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        LOAD_ALGORITHM> BlockLoadKeysT;
+
+    // BlockLoad type (values)
+    typedef BlockLoad<
+        ValueT,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        LOAD_ALGORITHM> BlockLoadValuesT;
+
+    // Value exchange array type
+    typedef ValueT ValueExchangeT[TILE_ITEMS];
+
+    /**
+     * Shared memory storage layout.  This is a union, so every member below overlays the same storage; the methods of this struct separate their uses of different members with CTA_SYNC()
+     */
+    union __align__(16) _TempStorage
+    {
+        typename BlockLoadKeysT::TempStorage    load_keys;              // Used by the BlockLoad-based LoadKeys overloads
+        typename BlockLoadValuesT::TempStorage  load_values;            // Used by the BlockLoad-based LoadValues overloads
+        typename BlockRadixRankT::TempStorage   radix_rank;             // Used by RankKeys in ProcessTile
+
+        struct      // These two are live together: ScatterKeys reads both after a single barrier
+        {
+            UnsignedBits                        exchange_keys[TILE_ITEMS];              // Keys stored at their rank, then read back in striped order
+            OffsetT                             relative_bin_offsets[RADIX_DIGITS];     // Per-digit offset, written by ProcessTile and read by ScatterKeys
+        };
+
+        Uninitialized<ValueExchangeT>           exchange_values;        // Values stored at their rank, then read back in striped order (ScatterValues)
+
+        OffsetT                                 exclusive_digit_prefix[RADIX_DIGITS];   // Per-digit exclusive prefix shared between threads in ProcessTile
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    // Shared storage for this CTA
+    _TempStorage    &temp_storage;
+
+    // Input and output device pointers
+    KeysItr         d_keys_in;
+    ValuesItr       d_values_in;
+    UnsignedBits    *d_keys_out;
+    ValueT          *d_values_out;
+
+    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
+    OffsetT         bin_offset[BINS_TRACKED_PER_THREAD];
+
+    // The least-significant bit position of the current digit to extract
+    int             current_bit;
+
+    // Number of bits in current digit
+    int             num_bits;
+
+    // Whether to short-circuit
+    int             short_circuit;
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Scatter ranked keys through shared memory, then to device-accessible memory.  Also fills relative_bin_offsets[] so the caller can reuse it when scattering values
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterKeys(
+        UnsignedBits    (&twiddled_keys)[ITEMS_PER_THREAD],         ///< [in] This thread's keys, already passed through TwiddleIn by the caller
+        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],  ///< [out] Per-item value of temp_storage.relative_bin_offsets[digit] for the key read back below
+        int             (&ranks)[ITEMS_PER_THREAD],                 ///< [in] Slot in temp_storage.exchange_keys where each of this thread's keys is stored
+        OffsetT         valid_items)                                ///< [in] Number of valid items in the tile (only consulted when FULL_TILE is false)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM];      // Place each key at its rank
+        }
+
+        CTA_SYNC();     // All keys must be in exchange_keys before any thread reads them back
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            UnsignedBits key            = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)];   // Read back at index threadIdx.x + ITEM * BLOCK_THREADS
+            UnsignedBits digit          = BFE(key, current_bit, num_bits);      // Extract the current digit: num_bits bits starting at current_bit
+            relative_bin_offsets[ITEM]  = temp_storage.relative_bin_offsets[digit];
+
+            // Un-twiddle
+            key = Traits<KeyT>::TwiddleOut(key);
+
+            if (FULL_TILE || 
+                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))     // Partial tiles: skip slots at or beyond valid_items
+            {
+                d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key;   // Output index = digit's relative offset + slot index
+            }
+        }
+    }
+
+
+    /**
+     * Scatter ranked values through shared memory, then to device-accessible memory, reusing the ranks and per-item bin offsets passed in by the caller
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterValues(
+        ValueT      (&values)[ITEMS_PER_THREAD],                ///< [in] This thread's values
+        OffsetT     (&relative_bin_offsets)[ITEMS_PER_THREAD],  ///< [in] Per-item offsets added to the slot index to form the output index
+        int         (&ranks)[ITEMS_PER_THREAD],                 ///< [in] Slot in the exchange array where each of this thread's values is stored
+        OffsetT     valid_items)                                ///< [in] Number of valid items in the tile (only consulted when FULL_TILE is false)
+    {
+        CTA_SYNC();     // exchange_values overlays other _TempStorage members (union); wait for earlier users
+
+        ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            exchange_values[ranks[ITEM]] = values[ITEM];    // Place each value at its rank
+        }
+
+        CTA_SYNC();     // All values must be in place before any thread reads them back
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];   // Read back at index threadIdx.x + ITEM * BLOCK_THREADS
+
+            if (FULL_TILE || 
+                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))     // Partial tiles: skip slots at or beyond valid_items
+            {
+                d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
+            }
+        }
+    }
+
+    /**
+     * Full-tile key load for any ranking algorithm lacking a more specific overload: unguarded BlockLoad, then a block-wide barrier
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<true>              is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        BlockLoadKeysT key_loader(temp_storage.load_keys);
+        key_loader.Load(d_keys_in + block_offset, keys);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Partial-tile key load for any ranking algorithm lacking a more specific overload: guarded BlockLoad (oob_item fills the gaps), then a block-wide barrier
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<false>             is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        BlockLoadKeysT key_loader(temp_storage.load_keys);
+        key_loader.Load(d_keys_in + block_offset, keys, valid_items, oob_item);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of keys (specialized for full tile, match ranking algorithm).  Calls LoadDirectWarpStriped directly: unlike the generic overload it touches no temp storage and issues no CTA_SYNC()
+     */
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],  ///< [out] Keys loaded by this thread
+        OffsetT                     block_offset,               ///< [in] Offset added to d_keys_in to reach the tile
+        OffsetT                     valid_items,                ///< Unused in this overload
+        UnsignedBits                oob_item,                   ///< Unused in this overload
+        Int2Type<true>              is_full_tile,               ///< Tag used only for overload selection
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)             ///< Tag used only for overload selection
+    {
+        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys);
+    }
+
+
+    /**
+     * Load a tile of keys (specialized for partial tile, match ranking algorithm).  Calls the guarded LoadDirectWarpStriped directly: no temp storage is touched and no CTA_SYNC() is issued
+     */
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],  ///< [out] Keys loaded by this thread
+        OffsetT                     block_offset,               ///< [in] Offset added to d_keys_in to reach the tile
+        OffsetT                     valid_items,                ///< [in] Number of valid items, forwarded to the guarded load
+        UnsignedBits                oob_item,                   ///< [in] Default key forwarded to the guarded load
+        Int2Type<false>             is_full_tile,               ///< Tag used only for overload selection
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)             ///< Tag used only for overload selection
+    {
+        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item);
+    }
+
+
+    /**
+     * Full-tile value load for any ranking algorithm lacking a more specific overload: unguarded BlockLoad, then a block-wide barrier
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<true>              is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        BlockLoadValuesT value_loader(temp_storage.load_values);
+        value_loader.Load(d_values_in + block_offset, values);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Partial-tile value load for any ranking algorithm lacking a more specific overload: guarded BlockLoad, then a block-wide barrier
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<false>             is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        BlockLoadValuesT value_loader(temp_storage.load_values);
+        value_loader.Load(d_values_in + block_offset, values, valid_items);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of values (specialized for full tile, match ranking algorithm).  Calls LoadDirectWarpStriped directly: no temp storage is touched and no CTA_SYNC() is issued
+     */
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],    ///< [out] Values loaded by this thread
+        OffsetT                     block_offset,                   ///< [in] Offset added to d_values_in to reach the tile
+        volatile OffsetT                     valid_items,           ///< Unused in this overload.  NOTE(review): volatile on a by-value parameter; reason not evident in this file
+        Int2Type<true>              is_full_tile,                   ///< Tag used only for overload selection
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)                 ///< Tag used only for overload selection
+    {
+        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values);
+    }
+
+
+    /**
+     * Load a tile of values (specialized for partial tile, match ranking algorithm).  Calls the guarded LoadDirectWarpStriped directly: no temp storage is touched and no CTA_SYNC() is issued
+     */
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],    ///< [out] Values loaded by this thread
+        OffsetT                     block_offset,                   ///< [in] Offset added to d_values_in to reach the tile
+        volatile OffsetT                     valid_items,           ///< [in] Number of valid items.  NOTE(review): volatile on a by-value parameter; reason not evident in this file
+        Int2Type<false>             is_full_tile,                   ///< Tag used only for overload selection
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)                 ///< Tag used only for overload selection
+    {
+        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items);
+    }
+
+
+    /**
+     * Truck along associated values: load this tile's values, then scatter them with the ranks and bin offsets already computed for the keys
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void GatherScatterValues(
+        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],  ///< [in] Per-item offsets, forwarded to ScatterValues
+        int             (&ranks)[ITEMS_PER_THREAD],                 ///< [in] Per-item ranks, forwarded to ScatterValues
+        OffsetT         block_offset,                               ///< [in] Tile offset, forwarded to LoadValues
+        OffsetT         valid_items,                                ///< [in] Number of valid items in the tile
+        Int2Type<false> /*is_keys_only*/)                           ///< Tag selecting this (key-value) overload
+    {
+        CTA_SYNC();     // Barrier before LoadValues, whose generic overloads use temp_storage.load_values (union-overlaid)
+
+        ValueT values[ITEMS_PER_THREAD];
+
+        LoadValues(
+            values,
+            block_offset,
+            valid_items,
+            Int2Type<FULL_TILE>(),
+            Int2Type<RANK_ALGORITHM>());
+
+        ScatterValues<FULL_TILE>(
+            values,
+            relative_bin_offsets,
+            ranks,
+            valid_items);
+    }
+
+
+    /**
+     * Truck along associated values (specialized for key-only sorting).  Intentionally a no-op: every argument is ignored and no barrier is issued
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void GatherScatterValues(
+        OffsetT         (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD],
+        int             (&/*ranks*/)[ITEMS_PER_THREAD],
+        OffsetT         /*block_offset*/,
+        OffsetT         /*valid_items*/,
+        Int2Type<true>  /*is_keys_only*/)       ///< Tag selecting this (keys-only) overload
+    {}
+
+
+    /**
+     * Process one tile: load keys, rank them on the current digit, advance the per-digit scatter offsets, then scatter keys (and values, unless KEYS_ONLY)
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ProcessTile(
+        OffsetT block_offset,                       ///< [in] Offset of the tile within the input
+        const OffsetT &valid_items = TILE_ITEMS)    ///< [in] Number of valid items in the tile (defaults to a whole tile)
+    {
+        UnsignedBits    keys[ITEMS_PER_THREAD];
+        int             ranks[ITEMS_PER_THREAD];
+        OffsetT         relative_bin_offsets[ITEMS_PER_THREAD];
+
+        // Default key handed to the partial-tile loads as their out-of-bounds item: LOWEST_KEY when descending, MAX_KEY otherwise
+        UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY;
+
+        // Load tile of keys
+        LoadKeys(
+            keys,
+            block_offset,
+            valid_items, 
+            default_key,
+            Int2Type<FULL_TILE>(),
+            Int2Type<RANK_ALGORITHM>());
+
+        // Twiddle key bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            keys[KEY] = Traits<KeyT>::TwiddleIn(keys[KEY]);
+        }
+
+        // Rank the twiddled keys
+        int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD];    // Filled by RankKeys for the bins this thread tracks
+        BlockRadixRankT(temp_storage.radix_rank).RankKeys(
+            keys,
+            ranks,
+            current_bit,
+            num_bits,
+            exclusive_digit_prefix);
+
+        CTA_SYNC();     // radix_rank and exclusive_digit_prefix are members of the same union; ranking must finish first
+
+        // Share exclusive digit prefix
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))    // Only bins below RADIX_DIGITS exist
+            {
+                // Store exclusive prefix
+                temp_storage.exclusive_digit_prefix[bin_idx] =
+                    exclusive_digit_prefix[track];
+            }
+        }
+
+        CTA_SYNC();     // Every bin's prefix must be published before neighbouring bins are read below
+
+        // Get inclusive digit prefix
+        int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
+
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                {
+                    // Get inclusive digit prefix from exclusive prefix (higher bins come first)
+                    inclusive_digit_prefix[track] = (bin_idx == 0) ?
+                        (BLOCK_THREADS * ITEMS_PER_THREAD) :                    // Bin 0 is last when descending: its inclusive prefix is the tile size
+                        temp_storage.exclusive_digit_prefix[bin_idx - 1];
+                }
+                else
+                {
+                    // Get inclusive digit prefix from exclusive prefix (lower bins come first)
+                    inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ?
+                        (BLOCK_THREADS * ITEMS_PER_THREAD) :                    // The highest bin is last when ascending: its inclusive prefix is the tile size
+                        temp_storage.exclusive_digit_prefix[bin_idx + 1];
+                }
+            }
+        }
+
+        CTA_SYNC();     // exclusive_digit_prefix and relative_bin_offsets share the union; finish the reads above before the writes below
+
+        // Update global scatter base offsets for each digit
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                bin_offset[track] -= exclusive_digit_prefix[track];             // Offset relative to this tile's ranks
+                temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track]; // Published for ScatterKeys
+                bin_offset[track] += inclusive_digit_prefix[track];             // Net effect: advanced by (inclusive - exclusive) for the next tile
+            }
+        }
+
+        CTA_SYNC();     // relative_bin_offsets must be complete before ScatterKeys reads it
+
+        // Scatter keys
+        ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);
+
+        // Gather/scatter values
+        GatherScatterValues<FULL_TILE>(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type<KEYS_ONLY>());
+    }
+
+    //---------------------------------------------------------------------
+    // Copy shortcut
+    //---------------------------------------------------------------------
+
+    /**
+     * Copy the items in [block_offset, block_end) from d_in to d_out without reordering.
+     * Whole tiles are moved with unguarded striped loads/stores; a trailing partial
+     * tile, if present, is moved with guarded striped loads/stores.
+     */
+    template <
+        typename InputIteratorT,
+        typename T>
+    __device__ __forceinline__ void Copy(
+        InputIteratorT  d_in,
+        T               *d_out,
+        OffsetT         block_offset,
+        OffsetT         block_end)
+    {
+        // Whole tiles
+        for (; block_offset + TILE_ITEMS <= block_end; block_offset += TILE_ITEMS)
+        {
+            T tile_items[ITEMS_PER_THREAD];
+
+            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, tile_items);
+            CTA_SYNC();
+            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, tile_items);
+        }
+
+        // Trailing partial tile: guarded I/O restricted to the items that remain
+        if (block_offset < block_end)
+        {
+            OffsetT remaining_items = block_end - block_offset;
+            T       tile_items[ITEMS_PER_THREAD];
+
+            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, tile_items, remaining_items);
+            CTA_SYNC();
+            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, tile_items, remaining_items);
+        }
+    }
+
+
+    /**
+     * Copy tiles within the range of input (specialized for NullType).  Intentionally a no-op: chosen when the output pointer is NullType*, e.g. for the values of a keys-only sort
+     */
+    template <typename InputIteratorT>
+    __device__ __forceinline__ void Copy(
+        InputIteratorT  /*d_in*/,
+        NullType        * /*d_out*/,
+        OffsetT         /*block_offset*/,
+        OffsetT         /*block_end*/)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor (per-thread bin offsets supplied directly by the caller).  Also decides, block-wide, whether ProcessRegion may short-circuit into a plain copy
+     */
+    __device__ __forceinline__ AgentRadixSortDownsweep(
+        TempStorage     &temp_storage,                              ///< Reference to temp storage; aliased to the underlying _TempStorage
+        OffsetT         (&bin_offset)[BINS_TRACKED_PER_THREAD],     ///< [in] Initial scatter offsets for the bins this thread tracks
+        OffsetT         num_items,                                  ///< [in] Problem size, compared against the offsets in the short-circuit test
+        const KeyT      *d_keys_in,                                 ///< [in] Input keys
+        KeyT            *d_keys_out,                                ///< [out] Output keys
+        const ValueT    *d_values_in,                               ///< [in] Input values
+        ValueT          *d_values_out,                              ///< [out] Output values
+        int             current_bit,                                ///< [in] Least-significant bit position of the current digit
+        int             num_bits)                                   ///< [in] Number of bits in the current digit
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),     // Keys are handled through their unsigned-bits representation
+        d_values_in(d_values_in),
+        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+        d_values_out(d_values_out),
+        current_bit(current_bit),
+        num_bits(num_bits),
+        short_circuit(1)                                            // Assume short-circuit until some tracked bin disproves it
+    {
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            this->bin_offset[track] = bin_offset[track];    // Copy the caller's offset into the member of the same name
+
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                // Short circuit if the histogram has only bin counts of only zeros or problem-size
+                short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items));
+            }
+        }
+
+        short_circuit = CTA_SYNC_AND(short_circuit);    // Combine the per-thread flags via CTA_SYNC_AND so the whole block takes the same path
+    }
+
+
+    /**
+     * Constructor (per-thread bin offsets loaded from d_spine).  Also decides, block-wide, whether ProcessRegion may short-circuit into a plain copy
+     */
+    __device__ __forceinline__ AgentRadixSortDownsweep(
+        TempStorage     &temp_storage,      ///< Reference to temp storage; aliased to the underlying _TempStorage
+        OffsetT         num_items,          ///< [in] Problem size, compared against the first block's offsets in the short-circuit test
+        OffsetT         *d_spine,           ///< [in] Offsets indexed as (gridDim.x * bin) + block
+        const KeyT      *d_keys_in,         ///< [in] Input keys
+        KeyT            *d_keys_out,        ///< [out] Output keys
+        const ValueT    *d_values_in,       ///< [in] Input values
+        ValueT          *d_values_out,      ///< [out] Output values
+        int             current_bit,        ///< [in] Least-significant bit position of the current digit
+        int             num_bits)           ///< [in] Number of bits in the current digit
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),     // Keys are handled through their unsigned-bits representation
+        d_values_in(d_values_in),
+        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+        d_values_out(d_values_out),
+        current_bit(current_bit),
+        num_bits(num_bits),
+        short_circuit(1)                    // Assume short-circuit until some tracked bin disproves it
+    {
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            // Load digit bin offsets (each thread loads the offsets for the bins it tracks, provided the bin index is below RADIX_DIGITS)
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                    bin_idx = RADIX_DIGITS - bin_idx - 1;   // Descending sorts index the spine with the mirrored bin index
+
+                // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size
+                OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx];
+                short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
+
+                // Load my block's bin offset for my bin
+                bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
+            }
+        }
+
+        short_circuit = CTA_SYNC_AND(short_circuit);    // Combine the per-thread flags via CTA_SYNC_AND so the whole block takes the same path
+    }
+
+
+    /**
+     * Distribute keys from a segment of input tiles, [block_offset, block_end).
+     * When the constructor set short_circuit, keys and values are copied straight
+     * through; otherwise every full tile, then any trailing partial tile, goes
+     * through ProcessTile.
+     */
+    __device__ __forceinline__ void ProcessRegion(
+        OffsetT   block_offset,
+        OffsetT   block_end)
+    {
+        // Short-circuit path: copy keys, then values, and stop
+        if (short_circuit)
+        {
+            Copy(d_keys_in, d_keys_out, block_offset, block_end);
+            Copy(d_values_in, d_values_out, block_offset, block_end);
+            return;
+        }
+
+        // Ranking path: full tiles first
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ProcessTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+
+            // Barrier between consecutive tiles
+            CTA_SYNC();
+        }
+
+        // Then the trailing partial tile, handled with guarded I/O
+        if (block_offset < block_end)
+        {
+            ProcessTile<false>(block_offset, block_end - block_offset);
+        }
+    }
+
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/agent/agent_radix_sort_upsweep.cuh b/external/cub/cub/agent/agent_radix_sort_upsweep.cuh
new file mode 100644
index 00000000000..803fadf2486
--- /dev/null
+++ b/external/cub/cub/agent/agent_radix_sort_upsweep.cuh
@@ -0,0 +1,526 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
+ */
+
+#pragma once
+
+#include "../thread/thread_reduce.cuh"
+#include "../thread/thread_load.cuh"
+#include "../warp/warp_reduce.cuh"
+#include "../block/block_load.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentRadixSortUpsweep
+ */
+template <
+    int                 _BLOCK_THREADS,     ///< Number of threads in each thread block
+    int                 _ITEMS_PER_THREAD,  ///< Number of input items each thread handles per tile
+    CacheLoadModifier   _LOAD_MODIFIER,     ///< Cache modifier applied when keys are loaded
+    int                 _RADIX_BITS>        ///< Width of a radix digit in bits, i.e., log2(bins)
+struct AgentRadixSortUpsweepPolicy
+{
+    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;      ///< Cache modifier applied when keys are loaded
+
+    enum
+    {
+        RADIX_BITS          = _RADIX_BITS,          ///< Width of a radix digit in bits, i.e., log2(bins)
+        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Number of input items each thread handles per tile
+        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Number of threads in each thread block
+    };
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
+ */
+template <
+    typename AgentRadixSortUpsweepPolicy,   ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
+    typename KeyT,                          ///< KeyT type
+    typename OffsetT>                       ///< Signed integer type for global offsets
+struct AgentRadixSortUpsweep
+{
+
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;    // Unsigned bit-pattern type that Traits associates with KeyT
+
+    // Integer type for digit counters (to be packed into words of PackedCounters)
+    typedef unsigned char DigitCounter;
+
+    // Integer type for packing DigitCounters into columns of shared memory banks
+    typedef unsigned int PackedCounter;
+
+    static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER;
+
+    enum
+    {
+        RADIX_BITS              = AgentRadixSortUpsweepPolicy::RADIX_BITS,
+        BLOCK_THREADS           = AgentRadixSortUpsweepPolicy::BLOCK_THREADS,
+        KEYS_PER_THREAD         = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
+
+        RADIX_DIGITS            = 1 << RADIX_BITS,      // Number of distinct digit values (histogram bins)
+
+        LOG_WARP_THREADS        = CUB_PTX_LOG_WARP_THREADS,
+        WARP_THREADS            = 1 << LOG_WARP_THREADS,
+        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,    // ceil(BLOCK_THREADS / WARP_THREADS)
+
+        TILE_ITEMS              = BLOCK_THREADS * KEYS_PER_THREAD,      // Keys consumed by the block per full tile
+
+        BYTES_PER_COUNTER       = sizeof(DigitCounter),
+        LOG_BYTES_PER_COUNTER   = Log2<BYTES_PER_COUNTER>::VALUE,
+
+        PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),     // DigitCounters per PackedCounter word
+        LOG_PACKING_RATIO       = Log2<PACKING_RATIO>::VALUE,
+
+        LOG_COUNTER_LANES       = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
+        COUNTER_LANES           = 1 << LOG_COUNTER_LANES,       // Rows of packed words per thread; digit d lives in lane (d >> LOG_PACKING_RATIO)
+
+        // To prevent counter overflow, we must periodically unpack and aggregate the
+        // digit counters back into registers.  Each counter lane is assigned to a
+        // warp for aggregation.
+
+        LANES_PER_WARP          = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),     // ceil(COUNTER_LANES / WARPS), at least 1
+
+        // Unroll tiles in batches without risk of counter overflow (a thread makes KEYS_PER_THREAD increments per full tile)
+        UNROLL_COUNT            = CUB_MIN(64, 255 / KEYS_PER_THREAD),   // 255: the largest count an unsigned char DigitCounter is assumed to hold
+        UNROLLED_ELEMENTS       = UNROLL_COUNT * TILE_ITEMS,            // Keys consumed by the block per unrolled batch
+    };
+
+
+    // Input iterator wrapper type (for applying cache modifiers)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;
+
+    /**
+     * Shared memory storage layout
+     */
+    union __align__(16) _TempStorage
+    {
+        DigitCounter    thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];      // [counter lane][thread][sub-counter], as indexed by Bucket()
+        PackedCounter   packed_thread_counters[COUNTER_LANES][BLOCK_THREADS];              // The same counters viewed as one packed word per [counter lane][thread]
+        OffsetT         block_counters[WARP_THREADS][RADIX_DIGITS];                        // [warp lane id][digit] staging area written and summed by ExtractCounts()
+    };
+
+
+    /// Opaque wrapper around _TempStorage (allows the storage to be unioned); the constructor recovers the typed view via Alias()
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields (aggregate state bundle)
+    //---------------------------------------------------------------------
+
+    // Shared storage for this CTA (typed view obtained from TempStorage::Alias() in the constructor)
+    _TempStorage    &temp_storage;
+
+    // Thread-local running totals: UnpackDigitCounts() adds the narrow digit counters of this warp's counter lanes into these
+    OffsetT         local_counts[LANES_PER_WARP][PACKING_RATIO];
+
+    // Input keys, read as UnsignedBits through the cache-modified iterator
+    KeysItr         d_keys_in;
+
+    // The least-significant bit position of the current digit to extract
+    int             current_bit;
+
+    // Number of bits in current digit
+    int             num_bits;
+
+
+
+    //---------------------------------------------------------------------
+    // Helper structure for templated iteration
+    //---------------------------------------------------------------------
+
+    // Iterate
+    template <int COUNT, int MAX>
+    struct Iterate
+    {
+        // Buckets keys[COUNT], then recurses on the remaining keys up to MAX
+        static __device__ __forceinline__ void BucketKeys(
+            AgentRadixSortUpsweep       &agent,
+            UnsignedBits                keys[KEYS_PER_THREAD])
+        {
+            agent.Bucket(keys[COUNT]);
+
+            // Compile-time recursion onto the next key index
+            Iterate<COUNT + 1, MAX>::BucketKeys(agent, keys);
+        }
+    };
+
+    // Base case (COUNT == MAX): ends the recursion
+    template <int MAX>
+    struct Iterate<MAX, MAX>
+    {
+        // Nothing left to bucket
+        static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*agent*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {}
+    };
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Decode a key and increment corresponding smem digit counter
+     */
+    __device__ __forceinline__ void Bucket(UnsignedBits key)
+    {
+        // Map the raw key bits through the key type's TwiddleIn transform
+        const UnsignedBits twiddled = Traits<KeyT>::TwiddleIn(key);
+
+        // Pull out the num_bits-wide digit that starts at bit current_bit
+        const UnsignedBits bin = BFE(twiddled, current_bit, num_bits);
+
+        // The high digit bits select the counter lane (row of packed words) ...
+        const UnsignedBits lane = bin >> LOG_PACKING_RATIO;
+
+        // ... and the low LOG_PACKING_RATIO bits select the counter inside the packed word
+        const UnsignedBits slot = bin & (PACKING_RATIO - 1);
+
+        // Bump this thread's own counter for the digit
+        ++temp_storage.thread_counters[lane][threadIdx.x][slot];
+    }
+
+
+    /**
+     * Reset composite counters
+     */
+    __device__ __forceinline__ void ResetDigitCounters()
+    {
+        // A single packed-word store per counter lane clears all PACKING_RATIO
+        // digit counters held in this thread's column of that lane
+        #pragma unroll
+        for (int lane = 0; lane < COUNTER_LANES; ++lane)
+            temp_storage.packed_thread_counters[lane][threadIdx.x] = 0;
+    }
+
+
+    /**
+     * Reset the unpacked counters in each thread
+     */
+    __device__ __forceinline__ void ResetUnpackedCounters()
+    {
+        // Zero every entry of local_counts, the per-thread running totals
+        // (LANES_PER_WARP rows of PACKING_RATIO sub-counters)
+        #pragma unroll
+        for (int lane = 0; lane < LANES_PER_WARP; ++lane)
+        {
+            #pragma unroll
+            for (int slot = 0; slot < PACKING_RATIO; ++slot)
+                local_counts[lane][slot] = 0;
+        }
+    }
+
+
+    /**
+     * Extracts and aggregates the digit counters for each counter lane
+     * owned by this warp
+     */
+    __device__ __forceinline__ void UnpackDigitCounts()
+    {
+        // Counter lanes are dealt out to warps round-robin: warp w aggregates lanes w, w + WARPS, ...
+        const unsigned int warp_id  = threadIdx.x >> LOG_WARP_THREADS;
+        const unsigned int lane_tid = LaneId();
+
+        #pragma unroll
+        for (int lane = 0; lane < LANES_PER_WARP; ++lane)
+        {
+            const int counter_lane = (lane * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                #pragma unroll
+                for (int column_base = 0; column_base < BLOCK_THREADS; column_base += WARP_THREADS)
+                {
+                    #pragma unroll
+                    for (int slot = 0; slot < PACKING_RATIO; ++slot)
+                    {
+                        local_counts[lane][slot] += OffsetT(temp_storage.thread_counters[counter_lane][lane_tid + column_base][slot]);
+                    }
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Processes a single, full tile
+     */
+    __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset)
+    {
+        // Per-thread slice of the tile, filled by a striped load
+        UnsignedBits tile_keys[KEYS_PER_THREAD];
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, tile_keys);
+
+        // Barrier between the loads and the counter updates (the original
+        // author's stated purpose: prevent hoisting)
+        CTA_SYNC();
+
+        // Count the digit of each loaded key
+        Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, tile_keys);
+    }
+
+
+    /**
+     * Processes a single load (may have some threads masked off)
+     */
+    __device__ __forceinline__ void ProcessPartialTile(
+        OffsetT block_offset,
+        const OffsetT &block_end)
+    {
+        // Handles the items that do not fill a whole tile.
+        // Thread t visits items block_offset + t, block_offset + t + BLOCK_THREADS, ...
+        // and stops at block_end, so threads whose first item lies past the
+        // end simply execute zero iterations.
+        for (block_offset += threadIdx.x; block_offset < block_end; block_offset += BLOCK_THREADS)
+        {
+            // One individual load per key, immediately counted
+            Bucket(d_keys_in[block_offset]);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentRadixSortUpsweep(
+        TempStorage &temp_storage,      ///< Temporary storage wrapper for this thread block
+        const KeyT  *d_keys_in,         ///< Input keys
+        int         current_bit,        ///< Least-significant bit position of the digit to histogram
+        int         num_bits)           ///< Number of bits in the digit
+    :
+        temp_storage(temp_storage.Alias()),                             // Typed view of the wrapped storage
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),    // Keys are read as raw UnsignedBits patterns
+        current_bit(current_bit),
+        num_bits(num_bits)
+    {}
+
+
+    /**
+     * Compute radix digit histograms from a segment of input tiles.
+     */
+    __device__ __forceinline__ void ProcessRegion(
+        OffsetT          block_offset,
+        const OffsetT    &block_end)
+    {
+        // Start from a clean slate: zero this thread's digit counters in
+        // temp_storage and its running totals in local_counts before any
+        // key is counted
+        ResetDigitCounters();
+        ResetUnpackedCounters();
+
+        // Phase 1: batches of UNROLL_COUNT full tiles.  The batch size bounds
+        // how far a narrow DigitCounter can advance, so every batch ends by
+        // folding the counters into local_counts and zeroing them again.
+        while (block_offset + UNROLLED_ELEMENTS <= block_end)
+        {
+            for (int tile = 0; tile < UNROLL_COUNT; ++tile, block_offset += TILE_ITEMS)
+            {
+                ProcessFullTile(block_offset);
+            }
+
+            // Barrier: every thread has finished bucketing this batch before
+            // any counter is read
+            CTA_SYNC();
+            UnpackDigitCounts();
+
+            // Barrier: every read has completed before the counters are
+            // cleared for the next batch
+            CTA_SYNC();
+            ResetDigitCounters();
+        }
+
+        // Phase 2: the full tiles that remain (fewer than UNROLL_COUNT)
+        for (; block_offset + TILE_ITEMS <= block_end; block_offset += TILE_ITEMS)
+        {
+            ProcessFullTile(block_offset);
+        }
+
+        // Phase 3: the trailing items that do not fill a tile, if any
+        ProcessPartialTile(block_offset, block_end);
+
+        // Final barrier, then fold the outstanding digit counters into
+        // local_counts
+        CTA_SYNC();
+        UnpackDigitCounts();
+    }
+
+
+    /**
+     * Extract counts (saving them to the external array)
+     */
+    template <bool IS_DESCENDING>
+    __device__ __forceinline__ void ExtractCounts(
+        OffsetT     *counters,              ///< [out] Destination array for the per-digit counts
+        int         bin_stride = 1,         ///< Distance, in elements, between consecutive digits in \p counters
+        int         bin_offset = 0)         ///< Element offset added to every index written in \p counters
+    {
+        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid   = LaneId();
+        // NOTE(review): block_counters shares storage with thread_counters (union); presumably the caller synchronizes the block between ProcessRegion and this call -- confirm
+        // Place unpacked digit counters in shared memory: row = this thread's lane id, column = digit
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                int digit_row = counter_lane << LOG_PACKING_RATIO;     // First digit held in this counter lane
+
+                #pragma unroll
+                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                {
+                    int bin_idx = digit_row + UNPACKED_COUNTER;
+
+                    temp_storage.block_counters[warp_tid][bin_idx] =
+                        local_counts[LANE][UNPACKED_COUNTER];
+                }
+            }
+        }
+
+        CTA_SYNC();     // Every row must be written before any column is summed
+
+        // Rake-reduce bin_count reductions: summing a digit's column over the WARP_THREADS rows gives its block-wide count
+
+        // Whole blocks: digits [RADIX_DIGITS % BLOCK_THREADS, RADIX_DIGITS), BLOCK_THREADS digits per iteration
+        #pragma unroll
+        for (int BIN_BASE   = RADIX_DIGITS % BLOCK_THREADS;
+            (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS;
+            BIN_BASE += BLOCK_THREADS)
+        {
+            int bin_idx = BIN_BASE + threadIdx.x;
+
+            OffsetT bin_count = 0;
+            #pragma unroll
+            for (int i = 0; i < WARP_THREADS; ++i)
+                bin_count += temp_storage.block_counters[i][bin_idx];
+
+            if (IS_DESCENDING)      // Descending order: digit d is stored at the mirrored index
+                bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+        }
+        // Remainder: digits [0, min(BLOCK_THREADS, RADIX_DIGITS)).  NOTE(review): when BLOCK_THREADS < RADIX_DIGITS this also revisits
+        // digits [RADIX_DIGITS % BLOCK_THREADS, BLOCK_THREADS) already covered above; both paths store the same sum
+        if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS))
+        {
+            int bin_idx = threadIdx.x;
+
+            OffsetT bin_count = 0;
+            #pragma unroll
+            for (int i = 0; i < WARP_THREADS; ++i)
+                bin_count += temp_storage.block_counters[i][bin_idx];
+
+            if (IS_DESCENDING)      // Descending order: digit d is stored at the mirrored index
+                bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+        }
+    }
+
+
+    /**
+     * Extract counts
+     */
+    template <int BINS_TRACKED_PER_THREAD>
+    __device__ __forceinline__ void ExtractCounts(
+        OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD])  ///< [out] Block-wide counts for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        const unsigned int warp_id  = threadIdx.x >> LOG_WARP_THREADS;
+        const unsigned int lane_tid = LaneId();
+
+        // Stage 1: publish this thread's running totals.  Row index is the
+        // thread's lane id; column index is the digit.
+        #pragma unroll
+        for (int lane = 0; lane < LANES_PER_WARP; ++lane)
+        {
+            const int counter_lane = (lane * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                // First digit stored in this counter lane
+                const int first_digit = counter_lane << LOG_PACKING_RATIO;
+
+                #pragma unroll
+                for (int slot = 0; slot < PACKING_RATIO; ++slot)
+                {
+                    temp_storage.block_counters[lane_tid][first_digit + slot] = local_counts[lane][slot];
+                }
+            }
+        }
+
+        // Every row must be written before any column is summed
+        CTA_SYNC();
+
+        // Stage 2: each thread sums the columns of the digits it tracks
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            const int digit = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            // Digits outside the radix are skipped; their bin_count entries are left untouched
+            if ((BLOCK_THREADS != RADIX_DIGITS) && (digit >= RADIX_DIGITS))
+                continue;
+
+            bin_count[track] = 0;
+            #pragma unroll
+            for (int row = 0; row < WARP_THREADS; ++row)
+                bin_count[track] += temp_storage.block_counters[row][digit];
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/agent/agent_reduce.cuh b/external/cub/cub/agent/agent_reduce.cuh
new file mode 100644
index 00000000000..5528d8bdd64
--- /dev/null
+++ b/external/cub/cub/agent/agent_reduce.cuh
@@ -0,0 +1,385 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../block/block_load.cuh"
+#include "../block/block_reduce.cuh"
+#include "../grid/grid_mapping.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentReduce
+ */
+template <
+    int                     _BLOCK_THREADS,         ///< Number of threads in each thread block
+    int                     _ITEMS_PER_THREAD,      ///< Number of input items each thread handles per tile
+    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items fetched by one vectorized load
+    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Algorithm used for the cooperative block-wide reduction
+    CacheLoadModifier       _LOAD_MODIFIER>         ///< Cache modifier applied when input elements are loaded
+struct AgentReducePolicy
+{
+    static const CacheLoadModifier     LOAD_MODIFIER        = _LOAD_MODIFIER;       ///< Cache modifier applied when input elements are loaded
+    static const BlockReduceAlgorithm  BLOCK_ALGORITHM      = _BLOCK_ALGORITHM;     ///< Algorithm used for the cooperative block-wide reduction
+
+    enum
+    {
+        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items fetched by one vectorized load
+        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Number of input items each thread handles per tile
+        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Number of threads in each thread block
+    };
+};
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+ *
+ * Each thread reduces only the values it loads. If \p FIRST_TILE, this
+ * partial reduction is stored into \p thread_aggregate.  Otherwise it is
+ * accumulated into \p thread_aggregate.
+ */
+template <
+    typename AgentReducePolicy,        ///< Parameterized AgentReducePolicy tuning policy type
+    typename InputIteratorT,           ///< Random-access iterator type for input
+    typename OutputIteratorT,          ///< Random-access iterator type for output
+    typename OffsetT,                  ///< Signed integer type for global offsets
+    typename ReductionOp>              ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct AgentReduce
+{
+
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    /// The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    /// Vector type of InputT for data movement.  NOTE(review): sized by the policy's unclamped VECTOR_LOAD_LENGTH, whereas the enum below clamps to ITEMS_PER_THREAD -- confirm the two always agree when vectorizing
+    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
+
+    /// Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,      // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentReducePolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentReducePolicy::ITEMS_PER_THREAD,
+        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),    // Policy value, clamped to ITEMS_PER_THREAD
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,                                     // Items consumed by the block per full tile
+
+        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
+        ATTEMPT_VECTORIZATION   = (VECTOR_LOAD_LENGTH > 1) &&
+                                    (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
+                                    (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
+
+    };
+
+    static const CacheLoadModifier    LOAD_MODIFIER   = AgentReducePolicy::LOAD_MODIFIER;
+    static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;
+
+    /// Parameterized BlockReduce primitive (reduces OutputT values across BLOCK_THREADS threads)
+    typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        typename BlockReduceT::TempStorage  reduce;     ///< Scratch space handed to BlockReduceT
+    };
+
+    /// Opaque wrapper around _TempStorage (allows the storage to be unioned); the constructor recovers the typed view via Alias()
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&           temp_storage;       ///< Reference to temp_storage
+    InputIteratorT          d_in;               ///< Input data to reduce, as supplied (read by the vectorized full-tile path)
+    WrappedInputIteratorT   d_wrapped_in;       ///< The same input seen through WrappedInputIteratorT (read by the non-vectorized and partial-tile paths)
+    ReductionOp             reduction_op;       ///< Binary reduction operator
+
+
+    //---------------------------------------------------------------------
+    // Utility
+    //---------------------------------------------------------------------
+
+
+    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
+    template <typename Iterator>
+    static __device__ __forceinline__ bool IsAligned(Iterator d_in, Int2Type<true> /*can_vectorize*/)
+    {
+        // Low address bits under the (sizeof(VectorT) - 1) mask; sizeof(VectorT) is presumably a power of two -- TODO confirm
+        const size_t misalignment = size_t(d_in) & (sizeof(VectorT) - 1);
+        return misalignment == 0;
+    }
+
+    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
+    template <typename Iterator>
+    static __device__ __forceinline__ bool IsAligned(Iterator /*d_in*/, Int2Type<false> /*can_vectorize*/)
+    {
+        // Overload chosen when vectorized loads are not possible: the input
+        // is never reported as aligned, whatever the iterator is
+        return false;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentReduce(
+        TempStorage&            temp_storage,       ///< Reference to temp_storage
+        InputIteratorT          d_in,               ///< Input data to reduce
+        ReductionOp             reduction_op)       ///< Binary reduction operator
+    :
+        temp_storage(temp_storage.Alias()),         // Typed view of the wrapped storage
+        d_in(d_in),                                 // Kept as supplied for the vectorized path
+        d_wrapped_in(d_in),                         // Converted to WrappedInputIteratorT for the other paths
+        reduction_op(reduction_op)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Tile consumption
+    //---------------------------------------------------------------------
+
+    /**
+     * Consume a full tile of input (non-vectorized)
+     */
+    template <int IS_FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,  ///< [in,out] This thread's running partial reduction
+        OffsetT                 block_offset,       ///< The offset of the tile to consume
+        int                     /*valid_items*/,    ///< The number of valid items in the tile (unused: the tile is full)
+        Int2Type<true>          /*is_full_tile*/,   ///< Tag selecting the full-tile overload
+        Int2Type<false>         /*can_vectorize*/)  ///< Tag selecting the non-vectorized overload
+    {
+        // Striped load of this thread's share of the tile
+        OutputT stripe[ITEMS_PER_THREAD];
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, stripe);
+
+        // Fold the loaded items; only non-first tiles seed the fold with the running aggregate
+        if (IS_FIRST_TILE)
+            thread_aggregate = internal::ThreadReduce(stripe, reduction_op);
+        else
+            thread_aggregate = internal::ThreadReduce(stripe, reduction_op, thread_aggregate);
+    }
+
+
+    /**
+     * Consume a full tile of input (vectorized)
+     */
+    template <int IS_FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,  ///< [in,out] This thread's running partial reduction
+        OffsetT                 block_offset,       ///< The offset of the tile to consume
+        int                     /*valid_items*/,    ///< The number of valid items in the tile (unused: the tile is full)
+        Int2Type<true>          /*is_full_tile*/,   ///< Tag selecting the full-tile overload
+        Int2Type<true>          /*can_vectorize*/)  ///< Tag selecting the vectorized overload
+    {
+        // Vector loads issued by each thread per tile
+        enum { VECTORS_PER_THREAD = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+        // Address of this thread's first vector within the tile, wrapped in a cache-modified vector iterator
+        InputT *thread_base = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
+        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> vec_itr(
+            reinterpret_cast<VectorT*>(thread_base));
+
+        // Fill a scalar buffer through a vector-typed view of it; one thread's consecutive vectors are BLOCK_THREADS vectors apart
+        InputT loaded[ITEMS_PER_THREAD];
+        VectorT *loaded_as_vec = reinterpret_cast<VectorT*>(loaded);
+        #pragma unroll
+        for (int v = 0; v < VECTORS_PER_THREAD; ++v)
+            loaded_as_vec[v] = vec_itr[BLOCK_THREADS * v];
+
+        // Element-wise conversion InputT -> OutputT
+        OutputT items[ITEMS_PER_THREAD];
+        #pragma unroll
+        for (int k = 0; k < ITEMS_PER_THREAD; ++k)
+            items[k] = loaded[k];
+        // Fold the converted items; only non-first tiles seed the fold with the running aggregate
+        if (IS_FIRST_TILE)
+            thread_aggregate = internal::ThreadReduce(items, reduction_op);
+        else
+            thread_aggregate = internal::ThreadReduce(items, reduction_op, thread_aggregate);
+    }
+
+
+    /**
+     * Consume a partial tile of input
+     */
+    template <int IS_FIRST_TILE, int CAN_VECTORIZE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,  ///< [in,out] This thread's running partial reduction
+        OffsetT                 block_offset,       ///< The offset of the tile to consume
+        int                     valid_items,        ///< The number of valid items in the tile
+        Int2Type<false>         /*is_full_tile*/,   ///< Tag selecting the partial-tile overload
+        Int2Type<CAN_VECTORIZE> /*can_vectorize*/)  ///< Vectorization tag (ignored: partial tiles use scalar loads)
+    {
+        // Position of this thread's next item within the tile (advances by BLOCK_THREADS)
+        int item_idx = threadIdx.x;
+
+        // On the first tile the aggregate has no prior value, so it is seeded
+        // directly from the thread's first valid item (if it has one)
+        if ((IS_FIRST_TILE) && (item_idx < valid_items))
+        {
+            thread_aggregate = d_wrapped_in[block_offset + item_idx];
+            item_idx += BLOCK_THREADS;
+        }
+
+        // Fold in the remaining valid items
+        for (; item_idx < valid_items; item_idx += BLOCK_THREADS)
+        {
+            OutputT item = d_wrapped_in[block_offset + item_idx];
+            thread_aggregate = reduction_op(thread_aggregate, item);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Consume a contiguous segment of tiles
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles (even_share.block_offset is advanced by block_stride after each full tile)
+     */
+    template <int CAN_VECTORIZE>
+    __device__ __forceinline__ OutputT ConsumeRange(
+        GridEvenShare<OffsetT> &even_share,          ///< GridEvenShare descriptor (its block_offset is modified in place)
+        Int2Type<CAN_VECTORIZE> can_vectorize)      ///< Whether or not we can vectorize loads
+    {
+        OutputT thread_aggregate;   // Per-thread running aggregate, filled in by the ConsumeTile calls below
+
+        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
+        {
+            // First tile isn't full (not all threads have valid items)
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
+        }
+
+        // At least one full tile
+        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+        even_share.block_offset += even_share.block_stride;
+
+        // Consume subsequent full tiles of input
+        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
+        {
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+            even_share.block_offset += even_share.block_stride;
+        }
+
+        // Consume a partially-full tile (folded into the aggregates already seeded by the full tiles above)
+        if (even_share.block_offset < even_share.block_end)
+        {
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+        }
+
+        // Compute block-wide reduction (all threads have valid items)
+        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
+    }
+
+
+    /**
+     * \brief Reduce the input range [block_offset, block_end), dispatching on IsAligned() to the vectorized or non-vectorized path
+     */
+    __device__ __forceinline__ OutputT ConsumeRange(
+        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
+    {
+        GridEvenShare<OffsetT> range_share;
+        range_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
+
+        if (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>()))
+            return ConsumeRange(range_share, Int2Type<true && ATTEMPT_VECTORIZATION>());
+        return ConsumeRange(range_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+    }
+
+
+    /**
+     * Reduce the tiles described by the given even-share descriptor
+     */
+    __device__ __forceinline__ OutputT ConsumeTiles(
+        GridEvenShare<OffsetT> &even_share)        ///< [in] GridEvenShare descriptor
+    {
+        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
+        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
+
+        // Take the vectorized path only when d_in passes the alignment check
+        if (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>()))
+            return ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>());
+        return ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/agent/agent_reduce_by_key.cuh b/external/cub/cub/agent/agent_reduce_by_key.cuh
new file mode 100644
index 00000000000..a57d60ea210
--- /dev/null
+++ b/external/cub/cub/agent/agent_reduce_by_key.cuh
@@ -0,0 +1,549 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentReduceByKey: re-exports its template arguments as compile-time constants
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentReduceByKeyPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
+ */
+template <
+    typename    AgentReduceByKeyPolicyT,        ///< Parameterized AgentReduceByKeyPolicy tuning policy type
+    typename    KeysInputIteratorT,             ///< Random-access input iterator type for keys
+    typename    UniqueOutputIteratorT,          ///< Random-access output iterator type for keys
+    typename    ValuesInputIteratorT,           ///< Random-access input iterator type for values
+    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
+    typename    NumRunsOutputIteratorT,         ///< Output iterator type for recording number of items selected
+    typename    EqualityOpT,                    ///< KeyT equality operator type
+    typename    ReductionOpT,                   ///< ValueT reduction operator type
+    typename    OffsetT>                        ///< Signed integer type for global offsets
+struct AgentReduceByKey
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input keys type
+    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
+
+    // The output keys type
+    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
+        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type
+
+    // The input values type
+    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
+
+    // The output values type
+    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
+        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type
+
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+    typedef KeyValuePair<OffsetT, ValueOutputT> OffsetValuePairT;
+
+    // Tuple type for pairing keys and values
+    typedef KeyValuePair<KeyOutputT, ValueOutputT> KeyValuePairT;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
+
+    // Guarded inequality functor: negates the wrapped equality operator for in-bounds items and flags only the first out-of-bounds item
+    template <typename _EqualityOpT>
+    struct GuardedInequalityWrapper
+    {
+        _EqualityOpT     op;             ///< Wrapped equality operator
+        int             num_remaining;  ///< Items remaining (indices >= this value are treated as out of bounds)
+
+        /// Constructor
+        __host__ __device__ __forceinline__
+        GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {}
+
+        /// Returns <tt>!op(a, b)</tt> when \p idx is in bounds; otherwise returns true only for <tt>idx == num_remaining</tt>
+        template <typename T>
+        __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const
+        {
+            if (idx < num_remaining)
+                return !op(a, b);   // In bounds
+
+            // Return true if first out-of-bounds item, false otherwise
+            return (idx == num_remaining);
+       }
+    };
+
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentReduceByKeyPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TWO_PHASE_SCATTER   = (ITEMS_PER_THREAD > 1),
+
+        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
+        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueOutputT>::PRIMITIVE),
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
+    typedef typename If<IsPointer<KeysInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, KeyInputT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
+            KeysInputIteratorT>::Type                                                                   // Directly use the supplied input iterator type
+        WrappedKeysInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
+    typedef typename If<IsPointer<ValuesInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedInputIterator
+            ValuesInputIteratorT>::Type                                                                 // Directly use the supplied input iterator type
+        WrappedValuesInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values (selected on AggregatesOutputIteratorT being a pointer)
+    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,   // Wrap the native pointer with CacheModifiedInputIterator (element type is ValueInputT)
+            AggregatesOutputIteratorT>::Type                                                            // Directly use the supplied output iterator type
+        WrappedFixupInputIteratorT;
+
+    // Reduce-value-by-segment scan operator
+    typedef ReduceBySegmentOp<ReductionOpT> ReduceBySegmentOpT;
+
+    // Parameterized BlockLoad type for keys
+    typedef BlockLoad<
+            KeyOutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
+        BlockLoadKeysT;
+
+    // Parameterized BlockLoad type for values
+    typedef BlockLoad<
+            ValueOutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
+        BlockLoadValuesT;
+
+    // Parameterized BlockDiscontinuity type for keys
+    typedef BlockDiscontinuity<
+            KeyOutputT,
+            BLOCK_THREADS>
+        BlockDiscontinuityKeys;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OffsetValuePairT,
+            BLOCK_THREADS,
+            AgentReduceByKeyPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OffsetValuePairT,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Key and value exchange types
+    typedef KeyOutputT    KeyExchangeT[TILE_ITEMS + 1];
+    typedef ValueOutputT  ValueExchangeT[TILE_ITEMS + 1];
+
+    // Shared memory type for this thread block (a union: the members below overlap in storage)
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+            typename BlockDiscontinuityKeys::TempStorage    discontinuity;  // Smem needed for discontinuity detection
+        };
+
+        // Smem needed for loading keys
+        typename BlockLoadKeysT::TempStorage load_keys;
+
+        // Smem needed for loading values
+        typename BlockLoadValuesT::TempStorage load_values;
+
+        // Smem needed for compacting key-value pairs (Uninitialized<> allows non-POD items in this union)
+        Uninitialized<KeyValuePairT[TILE_ITEMS + 1]> raw_exchange;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedKeysInputIteratorT       d_keys_in;          ///< Input keys
+    UniqueOutputIteratorT           d_unique_out;       ///< Unique output keys
+    WrappedValuesInputIteratorT     d_values_in;        ///< Input values
+    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
+    NumRunsOutputIteratorT          d_num_runs_out;     ///< Output pointer for total number of segments identified
+    EqualityOpT                     equality_op;        ///< KeyT equality operator
+    ReductionOpT                    reduction_op;       ///< Reduction operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor: stores the iterators and operators, aliases the shared temp_storage, and builds scan_op from reduction_op
+    __device__ __forceinline__
+    AgentReduceByKey(
+        TempStorage&                temp_storage,       ///< Reference to temp_storage
+        KeysInputIteratorT          d_keys_in,          ///< Input keys
+        UniqueOutputIteratorT       d_unique_out,       ///< Unique output keys
+        ValuesInputIteratorT        d_values_in,        ///< Input values
+        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
+        NumRunsOutputIteratorT      d_num_runs_out,     ///< Output pointer for total number of segments identified
+        EqualityOpT                 equality_op,        ///< KeyT equality operator
+        ReductionOpT                reduction_op)       ///< ValueT reduction operator
+    :
+        temp_storage(temp_storage.Alias()),             // Bind to the underlying _TempStorage
+        d_keys_in(d_keys_in),                           // Stored as WrappedKeysInputIteratorT
+        d_unique_out(d_unique_out),
+        d_values_in(d_values_in),                       // Stored as WrappedValuesInputIteratorT
+        d_aggregates_out(d_aggregates_out),
+        d_num_runs_out(d_num_runs_out),
+        equality_op(equality_op),
+        reduction_op(reduction_op),
+        scan_op(reduction_op)                           // Reduce-by-segment operator wrapping reduction_op
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Scatter utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Write each flagged (key, aggregate) pair straight to its output offset
+     */
+    __device__ __forceinline__ void ScatterDirect(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD])
+    {
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            // Unflagged items produce no output
+            if (!segment_flags[i])
+                continue;
+            const OffsetT dst       = segment_indices[i];   // Output offset for this item
+            d_unique_out[dst]       = scatter_items[i].key;
+            d_aggregates_out[dst]   = scatter_items[i].value;
+        }
+    }
+
+
+    /**
+     * 2-phase scatter flagged items to output offsets
+     *
+     * Phase 1 compacts flagged pairs into raw_exchange at their tile-local index
+     * (segment index minus num_tile_segments_prefix); phase 2 copies them to the outputs
+     */
+    __device__ __forceinline__ void ScatterTwoPhase(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
+        OffsetT         num_tile_segments,              ///< Number of compacted pairs to copy out for this tile
+        OffsetT         num_tile_segments_prefix)       ///< Output offset at which this tile's first pair is written
+    {
+        CTA_SYNC();     // Barrier before raw_exchange is written: it is a union member of temp_storage
+
+        // Compact and scatter pairs
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (segment_flags[ITEM])
+            {
+                temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM];
+            }
+        }
+
+        CTA_SYNC();     // Barrier between the compaction writes above and the reads below
+
+        for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
+        {
+            KeyValuePairT pair                                  = temp_storage.raw_exchange.Alias()[item];
+            d_unique_out[num_tile_segments_prefix + item]       = pair.key;
+            d_aggregates_out[num_tile_segments_prefix + item]   = pair.value;
+        }
+    }
+
+
+    /**
+     * Scatter flagged items, selecting between the direct and the two-phase strategy
+     */
+    __device__ __forceinline__ void Scatter(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
+        OffsetT         num_tile_segments,
+        OffsetT         num_tile_segments_prefix)
+    {
+        // Two-phase scatter is used only when it is enabled at compile time and
+        // the tile holds more segments than the block has threads
+        const bool use_two_phase = TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS);
+        if (!use_two_phase)
+        {
+            // Otherwise each thread writes its own flagged items directly
+            ScatterDirect(scatter_items, segment_flags, segment_indices);
+            return;
+        }
+
+        ScatterTwoPhase(
+            scatter_items,
+            segment_flags,
+            segment_indices,
+            num_tile_segments,
+            num_tile_segments_prefix);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan): load keys/values, flag segment heads, scan, then scatter
+     */
+    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        KeyOutputT          keys[ITEMS_PER_THREAD];             // Tile keys
+        KeyOutputT          prev_keys[ITEMS_PER_THREAD];        // Tile keys shuffled up
+        ValueOutputT        values[ITEMS_PER_THREAD];           // Tile values
+        OffsetT             head_flags[ITEMS_PER_THREAD];       // Segment head flags
+        OffsetT             segment_indices[ITEMS_PER_THREAD];  // Segment indices
+        OffsetValuePairT    scan_items[ITEMS_PER_THREAD];       // Zipped values and segment flags|indices
+        KeyValuePairT       scatter_items[ITEMS_PER_THREAD];    // Zipped key value pairs for scattering
+
+        // Load keys
+        if (IS_LAST_TILE)
+            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining);
+        else
+            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);
+
+        // Load tile predecessor key in first thread
+        KeyOutputT tile_predecessor;
+        if (threadIdx.x == 0)
+        {
+            tile_predecessor = (tile_idx == 0) ?
+                keys[0] :                       // First tile gets repeat of first item (thus first item will not be flagged as a head)
+                d_keys_in[tile_offset - 1];     // Subsequent tiles get last key from previous tile
+        }
+
+        CTA_SYNC();     // Barrier before temp_storage is reused: load_keys and load_values are union members
+
+        // Load values
+        if (IS_LAST_TILE)
+            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining);
+        else
+            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values);
+
+        CTA_SYNC();     // Barrier before temp_storage is reused for discontinuity/scan storage
+
+        // Initialize head-flags and shuffle up the previous keys
+        if (IS_LAST_TILE)
+        {
+            // Use custom flag operator to additionally flag the first out-of-bounds item
+            GuardedInequalityWrapper<EqualityOpT> flag_op(equality_op, num_remaining);
+            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
+                head_flags, keys, prev_keys, flag_op, tile_predecessor);
+        }
+        else
+        {
+            InequalityWrapper<EqualityOpT> flag_op(equality_op);
+            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
+                head_flags, keys, prev_keys, flag_op, tile_predecessor);
+        }
+
+        // Zip values and head flags
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            scan_items[ITEM].value  = values[ITEM];
+            scan_items[ITEM].key    = head_flags[ITEM];
+        }
+
+        // Perform exclusive tile scan
+        OffsetValuePairT    block_aggregate;        // Inclusive block-wide scan aggregate
+        OffsetT             num_segments_prefix;    // Number of segments prior to this tile
+        ValueOutputT        total_aggregate;        // The tile prefix folded with block_aggregate
+        if (tile_idx == 0)
+        {
+            // Scan first tile
+            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate);
+            num_segments_prefix     = 0;
+            total_aggregate         = block_aggregate.value;
+
+            // Update tile status if there are successor tiles
+            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
+                tile_state.SetInclusive(0, block_aggregate);
+        }
+        else
+        {
+            // Scan non-first tile
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op);
+
+            block_aggregate         = prefix_op.GetBlockAggregate();
+            num_segments_prefix     = prefix_op.GetExclusivePrefix().key;
+            total_aggregate         = reduction_op(
+                                        prefix_op.GetExclusivePrefix().value,
+                                        block_aggregate.value);
+        }
+
+        // Rezip scatter items and segment indices
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            scatter_items[ITEM].key     = prev_keys[ITEM];
+            scatter_items[ITEM].value   = scan_items[ITEM].value;
+            segment_indices[ITEM]       = scan_items[ITEM].key;
+        }
+
+        // At this point, each flagged segment head has:
+        //  - The key for the previous segment
+        //  - The reduced value from the previous segment
+        //  - The segment index for the reduced value
+
+        // Scatter flagged keys and values
+        OffsetT num_tile_segments = block_aggregate.key;
+        Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix);
+
+        // Last thread in last tile will output final count (and last pair, if necessary)
+        if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1))
+        {
+            OffsetT num_segments = num_segments_prefix + num_tile_segments;
+
+            // If the last tile is a whole tile, no out-of-bounds item exists to flag the final segment: output its key and aggregate here
+            if (num_remaining == TILE_ITEMS)
+            {
+                d_unique_out[num_segments]      = keys[ITEMS_PER_THREAD - 1];
+                d_aggregates_out[num_segments]  = total_aggregate;
+                num_segments++;
+            }
+
+            // Output the total number of segments identified
+            *d_num_runs_out = num_segments;
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan (each thread block consumes at most one tile)
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        OffsetT             num_items,          ///< Total number of input items (OffsetT, so counts beyond the range of int are not truncated)
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        int                 start_tile)         ///< The starting tile for the current grid
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
+        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not last tile
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+        else if (num_remaining > 0)
+        {
+            // Last tile (blocks whose tile starts at or past num_items do nothing)
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/agent/agent_rle.cuh b/external/cub/cub/agent/agent_rle.cuh
new file mode 100644
index 00000000000..0ba9216176c
--- /dev/null
+++ b/external/cub/cub/agent/agent_rle.cuh
@@ -0,0 +1,837 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentRle.  Re-exports each template argument as a nested compile-time constant.
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    bool                        _STORE_WARP_TIME_SLICING,       ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentRlePolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,     ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+    };
+    // The algorithm/modifier knobs are exposed as typed static constants rather than anonymous-enum members
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode 
+ */
+template <
+    typename    AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
+    typename    InputIteratorT,         ///< Random-access input iterator type for data
+    typename    OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values
+    typename    LengthsOutputIteratorT, ///< Random-access output iterator type for length values
+    typename    EqualityOpT,            ///< T equality operator type
+    typename    OffsetT>                ///< Signed integer type for global offsets
+struct AgentRle
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
+
+    /// The lengths output value type
+    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+        OffsetT,                                                                                                    // ... then the OffsetT type,
+        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+    /// Tuple type for scanning: .key (OffsetT) carries a run count or run offset, .value (LengthT) carries a run length
+    typedef KeyValuePair<OffsetT, LengthT> LengthOffsetPair;
+
+    /// Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        WARP_THREADS            = CUB_WARP_THREADS(PTX_ARCH),                               // Threads per warp for the target architecture
+        BLOCK_THREADS           = AgentRlePolicyT::BLOCK_THREADS,                           // Threads per block (from the policy)
+        ITEMS_PER_THREAD        = AgentRlePolicyT::ITEMS_PER_THREAD,                        // Items per thread (from the policy)
+        WARP_ITEMS              = WARP_THREADS * ITEMS_PER_THREAD,                          // Items handled by one warp per tile
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,                         // Items handled by one block per tile
+        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,        // Warps per block, rounded up
+
+        /// Whether or not to sync after loading data (true for every load algorithm except BLOCK_LOAD_DIRECT)
+        SYNC_AFTER_LOAD         = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
+
+        /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+        STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING,
+        ACTIVE_EXCHANGE_WARPS   = (STORE_WARP_TIME_SLICING) ? 1 : WARPS,                    // Number of exchange buffers to allocate: one when time-slicing, else one per warp
+    };
+
+
+    /**
+     * Inequality functor handed to BlockDiscontinuity.  Inside the valid range it is simply the
+     * negation of the user's equality operator; on the last tile, any comparison whose index is
+     * at or beyond num_remaining is reported as "different" without consulting the user operator.
+     */
+    template <bool LAST_TILE>
+    struct OobInequalityOp
+    {
+        OffsetT         num_remaining;      // Global input items remaining (including this tile)
+        EqualityOpT     equality_op;        // User-supplied equality test
+
+        __device__ __forceinline__ OobInequalityOp(
+            OffsetT     num_remaining,
+            EqualityOpT equality_op)
+        :
+            num_remaining(num_remaining),
+            equality_op(equality_op)
+        {}
+
+        template <typename Index>
+        __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx)
+        {
+            // Out-of-bounds comparisons (only possible on the last tile) always count as a discontinuity
+            const bool out_of_bounds = LAST_TILE && !(idx < num_remaining);
+            // Short-circuit: the user operator is not invoked for out-of-bounds comparisons
+            return out_of_bounds || !equality_op(first, second);
+        }
+    };
+
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for data
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentRlePolicyT::LOAD_MODIFIER, T, OffsetT>,      // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIteratorT>::Type                                                       // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Parameterized BlockLoad type for data
+    typedef BlockLoad<
+            T,
+            AgentRlePolicyT::BLOCK_THREADS,
+            AgentRlePolicyT::ITEMS_PER_THREAD,
+            AgentRlePolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockDiscontinuity type for data
+    typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
+
+    // Parameterized WarpScan type (scans LengthOffsetPair tuples)
+    typedef WarpScan<LengthOffsetPair> WarpScanPairs;
+
+    // Reduce-length-by-run scan operator (built on cub::Sum)
+    typedef ReduceBySegmentOp<cub::Sum> ReduceBySegmentOpT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            LengthOffsetPair,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Warp exchange types
+    typedef WarpExchange<LengthOffsetPair, ITEMS_PER_THREAD>        WarpExchangePairs;
+    // Storage for the pair exchange is only a real type when warp time-slicing is enabled (NullType otherwise)
+    typedef typename If<STORE_WARP_TIME_SLICING, typename WarpExchangePairs::TempStorage, NullType>::Type WarpExchangePairsStorage;
+    // Separate offset / length exchanges, used by the non-time-sliced two-phase scatter
+    typedef WarpExchange<OffsetT, ITEMS_PER_THREAD>                 WarpExchangeOffsets;
+    typedef WarpExchange<LengthT, ITEMS_PER_THREAD>                 WarpExchangeLengths;
+    // Array type holding one aggregate pair per warp
+    typedef LengthOffsetPair WarpAggregates[WARPS];
+
+    // Shared memory type for this thread block
+    struct _TempStorage
+    {
+        // Aliasable storage layout: the flag/scan, load and scatter members below are union members and share the same bytes
+        union Aliasable
+        {
+            struct                                                                          // Unnamed struct: its members are accessed directly as aliasable.<member>
+            {
+                typename BlockDiscontinuityT::TempStorage       discontinuity;              // Smem needed for discontinuity detection
+                typename WarpScanPairs::TempStorage             warp_scan[WARPS];           // Smem needed for warp-synchronous scans
+                Uninitialized<LengthOffsetPair[WARPS]>          warp_aggregates;            // Smem needed for sharing warp-wide aggregates
+                typename TilePrefixCallbackOpT::TempStorage     prefix;                     // Smem needed for cooperative prefix callback
+            };
+
+            // Smem needed for input loading
+            typename BlockLoadT::TempStorage                    load;
+
+            // Aliasable layout needed for two-phase scatter (one array per exchange flavour, all sharing the same bytes)
+            union ScatterAliasable
+            {
+                unsigned long long                              align;                      // NOTE(review): presumably present only to force alignment of this union - confirm
+                WarpExchangePairsStorage                        exchange_pairs[ACTIVE_EXCHANGE_WARPS];
+                typename WarpExchangeOffsets::TempStorage       exchange_offsets[ACTIVE_EXCHANGE_WARPS];
+                typename WarpExchangeLengths::TempStorage       exchange_lengths[ACTIVE_EXCHANGE_WARPS];
+
+            } scatter_aliasable;
+
+        } aliasable;
+        // The fields below live outside the union, so they are not overlaid by the aliased storage
+        OffsetT             tile_idx;                   // Shared tile index
+        LengthOffsetPair    tile_inclusive;             // Inclusive tile prefix
+        LengthOffsetPair    tile_exclusive;             // Exclusive tile prefix
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+
+    WrappedInputIteratorT           d_in;               ///< Input sequence of data items (cache-modified wrapper when InputIteratorT is a raw pointer)
+    OffsetsOutputIteratorT          d_offsets_out;      ///< Output run offsets
+    LengthsOutputIteratorT          d_lengths_out;      ///< Output run lengths
+
+    EqualityOpT                     equality_op;        ///< T equality operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-length-by-flag scan operator
+    OffsetT                         num_items;          ///< Total number of input items
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Binds the agent to the block's shared storage and to the input/output iterators.
+    // d_in is converted to WrappedInputIteratorT here, and the segmented-reduction
+    // scan functor is built around cub::Sum.
+    __device__ __forceinline__ AgentRle(
+        TempStorage                 &temp_storage,      ///< [in] Shared storage for this thread block
+        InputIteratorT              d_in,               ///< [in] Input sequence of data items
+        OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Output sequence of run offsets
+        LengthsOutputIteratorT      d_lengths_out,      ///< [out] Output sequence of run lengths
+        EqualityOpT                 equality_op,        ///< [in] Equality operator for T
+        OffsetT                     num_items)          ///< [in] Total number of input items
+        : temp_storage(temp_storage.Alias())
+        , d_in(d_in)
+        , d_offsets_out(d_offsets_out)
+        , d_lengths_out(d_lengths_out)
+        , equality_op(equality_op)
+        , scan_op(cub::Sum())
+        , num_items(num_items)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for initializing the selections
+    //---------------------------------------------------------------------
+
+    template <bool FIRST_TILE, bool LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT             tile_offset,
+        OffsetT             num_remaining,
+        T                   (&items)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
+    {
+        bool                is_head[ITEMS_PER_THREAD];      // Head flag per item
+        bool                is_tail[ITEMS_PER_THREAD];      // Tail flag per item
+
+        OobInequalityOp<LAST_TILE> differs(num_remaining, equality_op);
+        BlockDiscontinuityT flagger(temp_storage.aliasable.discontinuity);
+
+        if (FIRST_TILE)
+        {
+            if (LAST_TILE)
+            {
+                // Only tile: always head-flags the first item and tail-flags the last item
+                flagger.FlagHeadsAndTails(is_head, is_tail, items, differs);
+            }
+            else
+            {
+                // First of several tiles: always head-flags the first item.  The last
+                // thread supplies the first item of the next tile as the tile successor.
+                T next_tile_first;
+                if (threadIdx.x == BLOCK_THREADS - 1)
+                    next_tile_first = d_in[tile_offset + TILE_ITEMS];
+
+                flagger.FlagHeadsAndTails(is_head, is_tail, next_tile_first, items, differs);
+            }
+        }
+        else
+        {
+            if (LAST_TILE)
+            {
+                // Final tile: always tail-flags the last item.  Thread 0 supplies the
+                // last item of the previous tile as the tile predecessor.
+                T prev_tile_last;
+                if (threadIdx.x == 0)
+                    prev_tile_last = d_in[tile_offset - 1];
+
+                flagger.FlagHeadsAndTails(is_head, prev_tile_last, is_tail, items, differs);
+            }
+            else
+            {
+                // Interior tile: needs both neighbours (successor is read first, then predecessor)
+                T next_tile_first;
+                if (threadIdx.x == BLOCK_THREADS - 1)
+                    next_tile_first = d_in[tile_offset + TILE_ITEMS];
+
+                T prev_tile_last;
+                if (threadIdx.x == 0)
+                    prev_tile_last = d_in[tile_offset - 1];
+
+                flagger.FlagHeadsAndTails(is_head, prev_tile_last, is_tail, next_tile_first, items, differs);
+            }
+        }
+
+        // Zip the flags into pairs: .key is 1 only for a head that is not also a tail;
+        // .value is 0 only for an item that is both head and tail
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            lengths_and_num_runs[ITEM].key      = is_head[ITEM] && !is_tail[ITEM];
+            lengths_and_num_runs[ITEM].value    = !(is_head[ITEM] && is_tail[ITEM]);
+        }
+    }
+
+    //---------------------------------------------------------------------
+    // Scan utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Warp-scans the per-thread (num_runs, length) totals, then combines the per-warp aggregates
+     * through shared memory into this warp's exclusive prefix and the tile-wide aggregate
+     */
+    __device__ __forceinline__ void WarpScanAllocations(
+        LengthOffsetPair    &tile_aggregate,
+        LengthOffsetPair    &warp_aggregate,
+        LengthOffsetPair    &warp_exclusive_in_tile,
+        LengthOffsetPair    &thread_exclusive_in_warp,
+        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
+    {
+        const unsigned int  warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        const int           lane_id = LaneId();
+        WarpAggregates      &shared_warp_aggregates = temp_storage.aliasable.warp_aggregates.Alias();
+
+        // Identity element handed to the warp scan
+        LengthOffsetPair zero_pair;
+        zero_pair.key   = 0;
+        zero_pair.value = 0;
+
+        // Reduce this thread's items, then scan those per-thread totals across the warp
+        LengthOffsetPair lane_inclusive;
+        LengthOffsetPair lane_total = internal::ThreadReduce(lengths_and_num_runs, scan_op);
+        WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan(
+            lane_total, lane_inclusive, thread_exclusive_in_warp, zero_pair, scan_op);
+
+        // Last lane in each warp publishes its inclusive result as the warp aggregate
+        if (lane_id == WARP_THREADS - 1)
+            shared_warp_aggregates[warp_id] = lane_inclusive;
+
+        // Block-wide barrier between publishing the aggregates and reading them back
+        CTA_SYNC();
+
+        // Walk the warp aggregates in order, capturing the running total seen just before this warp
+        warp_exclusive_in_tile  = zero_pair;
+        warp_aggregate          = shared_warp_aggregates[warp_id];
+        tile_aggregate          = shared_warp_aggregates[0];
+
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; ++WARP)
+        {
+            if (warp_id == WARP)
+                warp_exclusive_in_tile = tile_aggregate;
+
+            tile_aggregate = scan_op(tile_aggregate, shared_warp_aggregates[WARP]);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for scattering selections
+    //---------------------------------------------------------------------
+
+    /**
+     * Two-phase scatter, specialized for warp time-slicing.
+     * Phase 1: warps take turns compacting their (offset, length) pairs into striped order
+     * through the single exchange buffer exchange_pairs[0], with a block barrier between turns.
+     * Phase 2: each lane writes the compacted pairs it now holds to the output iterators.
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
+        Int2Type<true>      is_warp_time_slice)
+    {
+        // is_warp_time_slice only selects this overload; its value is never read
+
+        const unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        const int          lane_id = LaneId();
+
+        // Locally compact items within each warp, one warp per time slice
+        #pragma unroll
+        for (int SLICE = 0; SLICE < WARPS; ++SLICE)
+        {
+            // Barrier before each hand-over of the shared buffer (none ahead of the first slice)
+            if (SLICE > 0)
+                CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
+                    lengths_and_offsets, thread_num_runs_exclusive_in_warp);
+            }
+        }
+
+        // Global scatter
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            // Skip striped slots beyond the number of runs this warp produced
+            if (!((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id))
+                continue;
+
+            const OffsetT item_offset =
+                tile_num_runs_exclusive_in_global +
+                warp_num_runs_exclusive_in_tile +
+                (ITEM * WARP_THREADS) + lane_id;
+
+            // Scatter offset
+            d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
+
+            // Scatter length one slot back, unless this is the first (global) slot
+            const bool is_first_global_slot = FIRST_TILE && (ITEM == 0) && (threadIdx.x == 0);
+            if (!is_first_global_slot)
+                d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
+        }
+    }
+
+
+    /**
+     * Two-phase scatter (no time-slicing): each warp compacts its run offsets and run lengths into striped
+     * order through its own exchange buffers (indexed by warp_id), then writes them to the output iterators
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
+        Int2Type<false>     is_warp_time_slice)
+    {
+        const unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        const int          lane_id = LaneId();
+
+        // Unzip the pairs into separate arrays so offsets and lengths can be exchanged independently
+        OffsetT offsets_of_runs[ITEMS_PER_THREAD];
+        LengthT lengths_of_runs[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            offsets_of_runs[ITEM] = lengths_and_offsets[ITEM].key;
+            lengths_of_runs[ITEM] = lengths_and_offsets[ITEM].value;
+        }
+
+        WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped(
+            offsets_of_runs, thread_num_runs_exclusive_in_warp);
+
+        // Warp-level sync between the two exchanges: both buffers are members of the same union
+        WARP_SYNC(0xffffffff);
+
+        WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped(
+            lengths_of_runs, thread_num_runs_exclusive_in_warp);
+
+        // Global scatter
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            // Skip striped slots beyond the number of runs this warp produced
+            if (!((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate))
+                continue;
+            const OffsetT item_offset =
+                tile_num_runs_exclusive_in_global +
+                warp_num_runs_exclusive_in_tile +
+                (ITEM * WARP_THREADS) + lane_id;
+
+            // Scatter offset
+            d_offsets_out[item_offset] = offsets_of_runs[ITEM];
+
+            // Scatter length one slot back, unless this is the first (global) slot
+            if (!(FIRST_TILE && (ITEM == 0) && (threadIdx.x == 0)))
+                d_lengths_out[item_offset - 1] = lengths_of_runs[ITEM];
+        }
+    }
+
+
+    /**
+     * Direct scatter: every thread writes its own runs straight to the output iterators, with no
+     * shared-memory compaction.  Items ranked at or beyond warp_num_runs_aggregate are skipped.
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterDirect(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            const OffsetT rank_in_warp = thread_num_runs_exclusive_in_warp[ITEM];
+
+            // Nothing to write for items whose rank falls outside this warp's run count
+            if (!(rank_in_warp < warp_num_runs_aggregate))
+                continue;
+
+            const OffsetT item_offset =
+                tile_num_runs_exclusive_in_global + warp_num_runs_exclusive_in_tile + rank_in_warp;
+
+            // Scatter offset
+            d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
+
+            // Scatter length one slot back; slot 0 has no preceding slot to write to
+            if (item_offset >= 1)
+                d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
+        }
+    }
+
+
+    /**
+     * Scatter dispatcher.  Takes the direct path when there is one item per thread or the tile produced
+     * fewer runs than there are threads in the block; otherwise takes the two-phase path
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void Scatter(
+        OffsetT             tile_num_runs_aggregate,
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
+    {
+        const bool use_two_phase = (ITEMS_PER_THREAD != 1) && !(tile_num_runs_aggregate < BLOCK_THREADS);
+        if (use_two_phase)
+        {
+            ScatterTwoPhase<FIRST_TILE>(
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets,
+                Int2Type<STORE_WARP_TIME_SLICING>());
+            return;
+        }
+
+        // Direct scatter, but only when this warp has any runs to write
+        if (warp_num_runs_aggregate != 0)
+        {
+            ScatterDirect<FIRST_TILE>(
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets);
+        }
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan)
+     */
+    template <
+        bool                LAST_TILE>
+    __device__ __forceinline__ LengthOffsetPair ConsumeTile(
+        OffsetT             num_items,          ///< Total number of global input items
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,       ///< Tile offset
+        ScanTileStateT       &tile_status)       ///< Global list of tile status
+    {
+        if (tile_idx == 0)
+        {
+            // First tile
+
+            // Load items
+            T items[ITEMS_PER_THREAD];
+            if (LAST_TILE)
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
+            else
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
+
+            if (SYNC_AFTER_LOAD)
+                CTA_SYNC();
+
+            // Set flags
+            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
+
+            InitializeSelections<true, LAST_TILE>(
+                tile_offset,
+                num_remaining,
+                items,
+                lengths_and_num_runs);
+
+            // Exclusive scan of lengths and runs
+            LengthOffsetPair tile_aggregate;
+            LengthOffsetPair warp_aggregate;
+            LengthOffsetPair warp_exclusive_in_tile;
+            LengthOffsetPair thread_exclusive_in_warp;
+
+            WarpScanAllocations(
+                tile_aggregate,
+                warp_aggregate,
+                warp_exclusive_in_tile,
+                thread_exclusive_in_warp,
+                lengths_and_num_runs);
+
+            // Update tile status if this is not the last tile
+            if (!LAST_TILE && (threadIdx.x == 0))
+                tile_status.SetInclusive(0, tile_aggregate);
+
+            // Update thread_exclusive_in_warp to fold in warp run-length
+            if (thread_exclusive_in_warp.key == 0)
+                thread_exclusive_in_warp.value += warp_exclusive_in_tile.value;
+
+            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
+            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
+            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
+
+            // Downsweep scan through lengths_and_num_runs
+            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
+
+            // Zip
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
+                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
+                                                                lengths_and_num_runs2[ITEM].key :         // keep
+                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
+            }
+
+            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
+            OffsetT tile_num_runs_exclusive_in_global    = 0;
+            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
+            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
+
+            // Scatter
+            Scatter<true>(
+                tile_num_runs_aggregate,
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets);
+
+            // Return running total (inclusive of this tile)
+            return tile_aggregate;
+        }
+        else
+        {
+            // Not first tile
+
+            // Load items
+            T items[ITEMS_PER_THREAD];
+            if (LAST_TILE)
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
+            else
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
+
+            if (SYNC_AFTER_LOAD)
+                CTA_SYNC();
+
+            // Set flags
+            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
+
+            InitializeSelections<false, LAST_TILE>(
+                tile_offset,
+                num_remaining,
+                items,
+                lengths_and_num_runs);
+
+            // Exclusive scan of lengths and runs
+            LengthOffsetPair tile_aggregate;
+            LengthOffsetPair warp_aggregate;
+            LengthOffsetPair warp_exclusive_in_tile;
+            LengthOffsetPair thread_exclusive_in_warp;
+
+            WarpScanAllocations(
+                tile_aggregate,
+                warp_aggregate,
+                warp_exclusive_in_tile,
+                thread_exclusive_in_warp,
+                lengths_and_num_runs);
+
+            // First warp computes tile prefix in lane 0
+            TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx);
+            unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+            if (warp_id == 0)
+            {
+                prefix_op(tile_aggregate);
+                if (threadIdx.x == 0)
+                    temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
+            }
+
+            CTA_SYNC();
+
+            LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;
+
+            // Update thread_exclusive_in_warp to fold in warp and tile run-lengths
+            LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
+            if (thread_exclusive_in_warp.key == 0)
+                thread_exclusive_in_warp.value += thread_exclusive.value;
+
+            // Downsweep scan through lengths_and_num_runs
+            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
+            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
+            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
+
+            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
+
+            // Zip
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
+                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
+                                                                lengths_and_num_runs2[ITEM].key :         // keep
+                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
+            }
+
+            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
+            OffsetT tile_num_runs_exclusive_in_global    = tile_exclusive_in_global.key;
+            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
+            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
+
+            // Scatter
+            Scatter<false>(
+                tile_num_runs_aggregate,
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets);
+
+            // Return running total (inclusive of this tile)
+            return prefix_op.inclusive_prefix;
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    template <typename NumRunsIteratorT>            ///< Output iterator type for recording number of items selected
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_tiles,              ///< Total number of input tiles
+        ScanTileStateT&     tile_status,            ///< Global list of tile status
+        NumRunsIteratorT    d_num_runs_out)         ///< Output pointer for total number of runs identified
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block (tile index is flattened from the 2D grid)
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;           // Global offset for the current tile (widen to OffsetT before multiplying so the product is not computed in int)
+        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
+
+        if (tile_idx < num_tiles - 1)
+        {
+            // Not the last tile (full); its running total is not needed here
+            ConsumeTile<false>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
+        }
+        else if (num_remaining > 0)
+        {
+            // The last tile (possibly partially-full)
+            LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
+
+            if (threadIdx.x == 0)
+            {
+                // Output the total number of items selected
+                *d_num_runs_out = running_total.key;
+
+                // The inclusive prefix contains accumulated length reduction for the last run
+                if (running_total.key > 0)
+                    d_lengths_out[running_total.key - 1] = running_total.value;
+            }
+        }
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/agent/agent_scan.cuh b/external/cub/cub/agent/agent_scan.cuh
new file mode 100644
index 00000000000..567df8049e9
--- /dev/null
+++ b/external/cub/cub/agent/agent_scan.cuh
@@ -0,0 +1,471 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentScan
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockStoreAlgorithm         _STORE_ALGORITHM,               ///< The BlockStore algorithm to use
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentScanPolicy                                          // Re-exports each template argument under an un-prefixed name as a compile-time constant; AgentScan reads them through its AgentScanPolicyT parameter
+{
+    enum                                                        // Integral tuning constants
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
+    static const BlockStoreAlgorithm    STORE_ALGORITHM         = _STORE_ALGORITHM;         ///< The BlockStore algorithm to use
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;          ///< The BlockScan algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
+ */
+template <
+    typename AgentScanPolicyT,      ///< Parameterized AgentScanPolicyT tuning policy type
+    typename InputIteratorT,        ///< Random-access input iterator type
+    typename OutputIteratorT,       ///< Random-access output iterator type
+    typename ScanOpT,               ///< Scan functor type
+    typename InitValueT,            ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan)
+    typename OffsetT>               ///< Signed integer type for global offsets
+struct AgentScan
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type (also the type in which tiles are held and scanned)
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<OutputT> ScanTileStateT;
+
+    // Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIteratorT>::Type                                                           // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Constants
+    enum
+    {
+        IS_INCLUSIVE        = Equals<InitValueT, NullType>::VALUE,            // Inclusive scan if no init_value type is provided
+        BLOCK_THREADS       = AgentScanPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentScanPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,               // Items consumed by one thread block per tile
+    };
+
+    // Parameterized BlockLoad type (tiles are loaded as OutputT items)
+    typedef BlockLoad<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::ITEMS_PER_THREAD,
+            AgentScanPolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockStore type
+    typedef BlockStore<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::ITEMS_PER_THREAD,
+            AgentScanPolicyT::STORE_ALGORITHM>
+        BlockStoreT;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OutputT,
+            ScanOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+    typedef BlockScanRunningPrefixOp<
+            OutputT,
+            ScanOpT>
+        RunningPrefixCallbackOp;
+
+    // Shared memory type for this thread block (a union: load, store, and scan/prefix storage alias one another, hence the CTA_SYNC() between phases in ConsumeTile)
+    union _TempStorage
+    {
+        typename BlockLoadT::TempStorage    load;       // Smem needed for tile loading
+        typename BlockStoreT::TempStorage   store;      // Smem needed for tile storing
+
+        struct
+        {
+            typename TilePrefixCallbackOpT::TempStorage  prefix;     // Smem needed for cooperative prefix callback
+            typename BlockScanT::TempStorage             scan;       // Smem needed for tile scanning
+        };
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&               temp_storage;       ///< Reference to temp_storage
+    WrappedInputIteratorT       d_in;               ///< Input data
+    OutputIteratorT             d_out;              ///< Output data
+    ScanOpT                     scan_op;            ///< Binary scan operator
+    InitValueT                  init_value;         ///< The init_value element for ScanOpT
+
+
+    //---------------------------------------------------------------------
+    // Block scan utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Exclusive scan specialization (first tile): scans items in place, seeded with init_value
+     */
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        OutputT             init_value,
+        ScanOpT             scan_op,
+        OutputT             &block_aggregate,
+        Int2Type<false>     /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate);
+        block_aggregate = scan_op(init_value, block_aggregate);     // Fold init_value into the tile aggregate (presumably ExclusiveScan's aggregate excludes it -- confirm against BlockScan)
+    }
+
+
+    /**
+     * Inclusive scan specialization (first tile): scans items in place; init_value is unused
+     */
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        InitValueT          /*init_value*/,
+        ScanOpT             scan_op,
+        OutputT             &block_aggregate,
+        Int2Type<true>      /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * Exclusive scan specialization (subsequent tiles): the seed is supplied by prefix_op
+     */
+    template <typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        ScanOpT             scan_op,
+        PrefixCallback      &prefix_op,
+        Int2Type<false>     /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op);
+    }
+
+
+    /**
+     * Inclusive scan specialization (subsequent tiles): the seed is supplied by prefix_op
+     */
+    template <typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        ScanOpT             scan_op,
+        PrefixCallback      &prefix_op,
+        Int2Type<true>      /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor (binds the shared-memory alias and copies the iterators, operator, and seed into per-thread fields)
+    __device__ __forceinline__
+    AgentScan(
+        TempStorage&    temp_storage,       ///< Reference to temp_storage
+        InputIteratorT  d_in,               ///< Input data
+        OutputIteratorT d_out,              ///< Output data
+        ScanOpT         scan_op,            ///< Binary scan operator
+        InitValueT      init_value)         ///< Initial value to seed the exclusive scan
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_out(d_out),
+        scan_op(scan_op),
+        init_value(init_value)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan)
+     */
+    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile (selects guarded load/store)
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        // Load items
+        OutputT items[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
+
+        CTA_SYNC();     // temp_storage.load aliases the scan storage
+
+        // Perform tile scan
+        if (tile_idx == 0)
+        {
+            // Scan first tile
+            OutputT block_aggregate;
+            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
+            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
+                tile_state.SetInclusive(0, block_aggregate);    // Record tile 0's inclusive aggregate in the global tile state
+        }
+        else
+        {
+            // Scan non-first tile
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
+        }
+
+        CTA_SYNC();     // temp_storage.store aliases the scan storage
+
+        // Store items
+        if (IS_LAST_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        OffsetT             num_items,          ///< Total number of input items (OffsetT so that 64-bit item counts are not truncated)
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        int                 start_tile)         ///< The starting tile for the current grid
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
+        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not last tile
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+        else if (num_remaining > 0)
+        {
+            // Last tile
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scan a sequence of consecutive tiles (independent of other thread blocks)
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input
+     */
+    template <
+        bool                        IS_FIRST_TILE,              ///< Whether to start the running total from init_value instead of reading it from prefix_op
+        bool                        IS_LAST_TILE>               ///< Whether the tile may be partially-full (guards load/store with valid_items)
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT                     tile_offset,                ///< Tile offset
+        RunningPrefixCallbackOp&    prefix_op,                  ///< Running prefix operator
+        int                         valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
+    {
+        // Load items
+        OutputT items[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
+
+        CTA_SYNC();     // temp_storage.load aliases the scan storage
+
+        // Block scan
+        if (IS_FIRST_TILE)
+        {
+            OutputT block_aggregate;
+            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
+            prefix_op.running_total = block_aggregate;          // Overwrites any previous running total
+        }
+        else
+        {
+            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
+        }
+
+        CTA_SYNC();     // temp_storage.store aliases the scan storage
+
+        // Store items
+        if (IS_LAST_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles (template arguments below are <IS_FIRST_TILE, IS_LAST_TILE>)
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        OffsetT  range_offset,      ///< [in] Threadblock begin offset (inclusive)
+        OffsetT  range_end)         ///< [in] Threadblock end offset (exclusive)
+    {
+        RunningPrefixCallbackOp prefix_op(scan_op);
+
+        if (range_offset + TILE_ITEMS <= range_end)
+        {
+            // Consume first tile of input (full, so unguarded load/store)
+            ConsumeTile<true, false>(range_offset, prefix_op);
+            range_offset += TILE_ITEMS;
+
+            // Consume subsequent full tiles of input
+            while (range_offset + TILE_ITEMS <= range_end)
+            {
+                ConsumeTile<false, false>(range_offset, prefix_op);
+                range_offset += TILE_ITEMS;
+            }
+
+            // Consume a partially-full tile (must be guarded by valid_items)
+            if (range_offset < range_end)
+            {
+                int valid_items = range_end - range_offset;
+                ConsumeTile<false, true>(range_offset, prefix_op, valid_items);
+            }
+        }
+        else
+        {
+            // Consume the first tile of input (partially-full, must be guarded by valid_items)
+            int valid_items = range_end - range_offset;
+            ConsumeTile<true, true>(range_offset, prefix_op, valid_items);
+        }
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles, seeded with the specified prefix value
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        OffsetT range_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT range_end,                          ///< [in] Threadblock end offset (exclusive)
+        OutputT prefix)                             ///< [in] The prefix to apply to the scan segment
+    {
+        RunningPrefixCallbackOp prefix_op(prefix, scan_op);
+
+        // Consume full tiles of input (never IS_FIRST_TILE: that path would overwrite the seeded running total)
+        while (range_offset + TILE_ITEMS <= range_end)
+        {
+            ConsumeTile<false, false>(range_offset, prefix_op);
+            range_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile (must be guarded by valid_items)
+        if (range_offset < range_end)
+        {
+            int valid_items = range_end - range_offset;
+            ConsumeTile<false, true>(range_offset, prefix_op, valid_items);
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/agent/agent_segment_fixup.cuh b/external/cub/cub/agent/agent_segment_fixup.cuh
new file mode 100644
index 00000000000..efa6d8693ff
--- /dev/null
+++ b/external/cub/cub/agent/agent_segment_fixup.cuh
@@ -0,0 +1,375 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSegmentFixup
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentSegmentFixupPolicy                                  // Re-exports each template argument under an un-prefixed name as a compile-time constant; AgentSegmentFixup reads them through its AgentSegmentFixupPolicyT parameter
+{
+    enum                                                        // Integral tuning constants
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
+ */
+template <
+    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
+    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
+    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
+    typename    EqualityOpT,                    ///< KeyT equality operator type
+    typename    ReductionOpT,                   ///< ValueT reduction operator type
+    typename    OffsetT>                        ///< Signed integer type for global offsets
+struct AgentSegmentFixup
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Data type of key-value input iterator
+    typedef typename std::iterator_traits<PairsInputIteratorT>::value_type KeyValuePairT;
+
+    // Value type
+    typedef typename KeyValuePairT::Value ValueT;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentSegmentFixupPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        // Whether or not do fixup using RLE + global atomics
+        USE_ATOMIC_FIXUP    = (CUB_PTX_ARCH >= 350) && 
+                                (Equals<ValueT, float>::VALUE || 
+                                 Equals<ValueT, int>::VALUE ||
+                                 Equals<ValueT, unsigned int>::VALUE ||
+                                 Equals<ValueT, unsigned long long>::VALUE),
+
+        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
+        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueT>::PRIMITIVE),
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
+    typedef typename If<IsPointer<PairsInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, KeyValuePairT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            PairsInputIteratorT>::Type                                                                      // Directly use the supplied input iterator type
+        WrappedPairsInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
+    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, ValueT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            AggregatesOutputIteratorT>::Type                                                        // Directly use the supplied input iterator type
+        WrappedFixupInputIteratorT;
+
+    // Reduce-value-by-segment scan operator
+    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;
+
+    // Parameterized BlockLoad type for pairs
+    typedef BlockLoad<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSegmentFixupPolicyT::LOAD_ALGORITHM>
+        BlockLoadPairs;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            AgentSegmentFixupPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            KeyValuePairT,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+        };
+
+        // Smem needed for loading keys
+        typename BlockLoadPairs::TempStorage load_pairs;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedPairsInputIteratorT      d_pairs_in;          ///< Input keys
+    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
+    WrappedFixupInputIteratorT      d_fixup_in;         ///< Fixup input values
+    InequalityWrapper<EqualityOpT>  inequality_op;      ///< KeyT inequality operator
+    ReductionOpT                    reduction_op;       ///< Reduction operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor: binds shared storage, the input pairs and the output aggregates; d_fixup_in is built from d_aggregates_out so stored aggregates can be read back
+    __device__ __forceinline__
+    AgentSegmentFixup(
+        TempStorage&                temp_storage,       ///< Reference to temp_storage
+        PairsInputIteratorT         d_pairs_in,          ///< Input key-value pairs
+        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
+        EqualityOpT                 equality_op,        ///< KeyT equality operator
+        ReductionOpT                reduction_op)       ///< ValueT reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_pairs_in(d_pairs_in),
+        d_aggregates_out(d_aggregates_out),
+        d_fixup_in(d_aggregates_out),                   // Read-side wrapper constructed from the same iterator as the output
+        inequality_op(equality_op),                     // Held through InequalityWrapper<EqualityOpT>
+        reduction_op(reduction_op),
+        scan_op(reduction_op)                           // Reduce-by-segment scan operator built from the value reduction operator
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Process input tile: each thread folds its runs of equal keys and adds every finished run into d_aggregates_out with atomicAdd.  Specialized for atomic-fixup
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index (not read by this specialization)
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor (not read by this specialization)
+        Int2Type<true>      use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
+    {
+        KeyValuePairT   pairs[ITEMS_PER_THREAD];
+
+        // Load pairs; oob_pair (key -1, value left unset) is handed to the guarded load as the default for slots past num_remaining
+        KeyValuePairT oob_pair;
+        oob_pair.key = -1;
+
+        if (IS_LAST_TILE)
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
+        else
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
+
+        // Run-length combine within the thread: while the key repeats, carry the reduced value forward; when it changes, flush the finished run
+        #pragma unroll
+        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key;
+            if (pairs[ITEM].key != pairs[ITEM - 1].key)
+                atomicAdd(d_scatter, pairs[ITEM - 1].value);
+            else
+                pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value);
+        }
+
+        // Flush last item if valid (in the last tile a negative key marks a slot that was filled with oob_pair)
+        ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key;
+        if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0))
+            atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value);
+    }
+
+
+    /**
+     * Process input tile: block-wide exclusive scan of the pairs with scan_op, then fold each prefix whose key differs from its item's key into d_aggregates_out.  Specialized for reduce-by-key fixup
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        Int2Type<false>     use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
+    {
+        KeyValuePairT   pairs[ITEMS_PER_THREAD];
+        KeyValuePairT   scatter_pairs[ITEMS_PER_THREAD];
+
+        // Load pairs; oob_pair (key -1, value left unset) is handed to the guarded load as the default for slots past num_remaining
+        KeyValuePairT oob_pair;
+        oob_pair.key = -1;
+
+        if (IS_LAST_TILE)
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
+        else
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
+
+        CTA_SYNC();     // load_pairs is unioned with the scan/prefix storage used next
+
+        KeyValuePairT tile_aggregate;
+        if (tile_idx == 0)
+        {
+            // First tile: exclusive scan with no incoming prefix; the block-wide aggregate is returned in tile_aggregate
+            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate);
+
+            // Thread 0 patches its first scatter key and, unless this is also the last tile, publishes the aggregate for tile 0
+            if (threadIdx.x == 0)
+            {
+                // Set first segment id to not trigger a flush (invalid from exclusive scan)
+                scatter_pairs[0].key = pairs[0].key;
+
+                if (!IS_LAST_TILE)
+                    tile_state.SetInclusive(0, tile_aggregate);
+
+            }
+        }
+        else
+        {
+            // Later tiles: exclusive scan seeded through the tile-prefix callback, which is constructed over tile_state for this tile_idx
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op);
+            tile_aggregate = prefix_op.GetBlockAggregate();
+        }
+
+        // Scatter updated values: a prefix whose key differs from the current item's key is combined with the stored aggregate at that prefix key
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (scatter_pairs[ITEM].key != pairs[ITEM].key)
+            {
+                // Update the value at the key location
+                ValueT value    = d_fixup_in[scatter_pairs[ITEM].key];
+                value           = reduction_op(value, scatter_pairs[ITEM].value);
+
+                d_aggregates_out[scatter_pairs[ITEM].key] = value;
+            }
+        }
+
+        // Finalize the last item
+        if (IS_LAST_TILE)
+        {
+            // Only the last thread of the block handles the trailing segment
+            if (threadIdx.x == BLOCK_THREADS - 1)
+            {
+                // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
+                if (num_remaining == TILE_ITEMS)
+                {
+                    // Update the value at the key location
+                    OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key;
+                    d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]);
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan: each thread block derives one tile index from its 2D block coordinates and consumes that tile; blocks whose tile starts at or past num_items do nothing
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,          ///< Total number of input items
+        int                 num_tiles,          ///< Total number of input tiles (not read here)
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = OffsetT(tile_idx) * TILE_ITEMS;           // Global offset for the current tile (tile_idx is widened first so the product is formed in OffsetT, not int)
+        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not the last tile (full)
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
+        }
+        else if (num_remaining > 0)
+        {
+            // The last tile (possibly partially-full)
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/agent/agent_select_if.cuh b/external/cub/cub/agent/agent_select_if.cuh
new file mode 100644
index 00000000000..f365481915b
--- /dev/null
+++ b/external/cub/cub/agent/agent_select_if.cuh
@@ -0,0 +1,703 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSelectIf: republishes each template argument as a nested compile-time constant of the same name without the leading underscore
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentSelectIfPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+
+/**
+ * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection
+ *
+ * Performs functor-based selection if SelectOpT functor type != NullType
+ * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
+ * Otherwise performs discontinuity selection (keep unique)
+ */
+template <
+    typename    AgentSelectIfPolicyT,           ///< Parameterized AgentSelectIfPolicy tuning policy type
+    typename    InputIteratorT,                 ///< Random-access input iterator type for selection items
+    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename    SelectedOutputIteratorT,        ///< Random-access input iterator type for selection_flags items
+    typename    SelectOpT,                      ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
+    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
+struct AgentSelectIf
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
+        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // The flag value type
+    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        USE_SELECT_OP,
+        USE_SELECT_FLAGS,
+        USE_DISCONTINUITY,
+
+        BLOCK_THREADS           = AgentSelectIfPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentSelectIfPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TWO_PHASE_SCATTER       = (ITEMS_PER_THREAD > 1),
+
+        SELECT_METHOD           = (!Equals<SelectOpT, NullType>::VALUE) ?
+                                    USE_SELECT_OP :
+                                    (!Equals<FlagT, NullType>::VALUE) ?
+                                        USE_SELECT_FLAGS :
+                                        USE_DISCONTINUITY
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for items
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, InputT, OffsetT>,        // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for selection flags
+    typedef typename If<IsPointer<FlagsInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, FlagT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            FlagsInputIteratorT>::Type                                                          // Directly use the supplied input iterator type
+        WrappedFlagsInputIteratorT;
+
+    // Parameterized BlockLoad type for input data
+    typedef BlockLoad<
+            OutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSelectIfPolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockLoad type for flags
+    typedef BlockLoad<
+            FlagT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSelectIfPolicyT::LOAD_ALGORITHM>
+        BlockLoadFlags;
+
+    // Parameterized BlockDiscontinuity type for items
+    typedef BlockDiscontinuity<
+            OutputT,
+            BLOCK_THREADS>
+        BlockDiscontinuityT;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OffsetT,
+            BLOCK_THREADS,
+            AgentSelectIfPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OffsetT,
+            cub::Sum,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Item exchange type
+    typedef OutputT ItemExchangeT[TILE_ITEMS];
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+            typename BlockDiscontinuityT::TempStorage       discontinuity;  // Smem needed for discontinuity detection
+        };
+
+        // Smem needed for loading items
+        typename BlockLoadT::TempStorage load_items;
+
+        // Smem needed for loading selection flags
+        typename BlockLoadFlags::TempStorage load_flags;
+
+        // Smem needed for compacting items (allows non POD items in this union)
+        Uninitialized<ItemExchangeT> raw_exchange;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedInputIteratorT           d_in;               ///< Input items
+    SelectedOutputIteratorT         d_selected_out;     ///< Unique output items
+    WrappedFlagsInputIteratorT      d_flags_in;         ///< Input selection flags (if applicable)
+    InequalityWrapper<EqualityOpT>  inequality_op;      ///< T inequality operator
+    SelectOpT                       select_op;          ///< Selection operator
+    OffsetT                         num_items;          ///< Total number of input items
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor: binds shared storage, iterators and operators (initializers are listed in member declaration order, which is the order they actually run in)
+    __device__ __forceinline__
+    AgentSelectIf(
+        TempStorage                 &temp_storage,      ///< Reference to temp_storage
+        InputIteratorT              d_in,               ///< Input data
+        FlagsInputIteratorT         d_flags_in,         ///< Input selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,     ///< Output data
+        SelectOpT                   select_op,          ///< Selection operator
+        EqualityOpT                 equality_op,        ///< Equality operator
+        OffsetT                     num_items)          ///< Total number of input items
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_selected_out(d_selected_out),
+        d_flags_in(d_flags_in),
+        inequality_op(equality_op),
+        select_op(select_op),
+        num_items(num_items)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for initializing the selections
+    //---------------------------------------------------------------------
+
+    /**
+     * Initialize selections (specialized for selection operator): each in-bounds item's flag is the result of select_op on that item
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     /*tile_offset*/,
+        OffsetT                     num_tile_items,
+        OutputT                     (&items)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_SELECT_OP>     /*select_method*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Default every slot to flagged (1); out-of-bounds slots of the last tile keep this value because select_op is not applied to them
+            selection_flags[ITEM] = 1;
+
+            if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+                selection_flags[ITEM] = select_op(items[ITEM]);
+        }
+    }
+
+
+    /**
+     * Initialize selections (specialized for valid flags): flags are block-loaded from d_flags_in at tile_offset and converted to OffsetT
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     tile_offset,
+        OffsetT                     num_tile_items,
+        OutputT                     (&/*items*/)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_SELECT_FLAGS>  /*select_method*/)
+    {
+        CTA_SYNC();     // load_flags is unioned with load_items in _TempStorage
+
+        FlagT flags[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+            // Guarded load: 1 is passed as the default for slots at or beyond num_tile_items, so out-of-bounds slots count as flagged
+            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1);
+        }
+        else
+        {
+            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
+        }
+
+        // Convert flag type to selection_flags type
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            selection_flags[ITEM] = flags[ITEM];
+        }
+    }
+
+
+    /**
+     * Initialize selections (specialized for discontinuity detection): flags come from BlockDiscontinuity::FlagHeads using inequality_op
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     tile_offset,
+        OffsetT                     num_tile_items,
+        OutputT                     (&items)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_DISCONTINUITY> /*select_method*/)
+    {
+        if (IS_FIRST_TILE)
+        {
+            CTA_SYNC();
+
+            // Set head selection_flags.  First tile sets the first flag for the first item
+            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op);
+        }
+        else
+        {
+            // Only thread 0 fetches the item just before this tile; other threads leave tile_predecessor unset
+            OutputT tile_predecessor;
+            if (threadIdx.x == 0)
+                tile_predecessor = d_in[tile_offset - 1];
+
+            CTA_SYNC();
+
+            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor);
+        }
+
+        // Override the computed flags for slots past the end of the input
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // In the last tile, any slot whose tile-local rank is at or beyond num_tile_items is forced to flagged (1)
+            if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
+                selection_flags[ITEM] = 1;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scatter utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Scatter flagged items to output offsets (specialized for direct scattering): each thread writes its own flagged items straight to d_selected_out
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void ScatterDirect(
+        OutputT (&items)[ITEMS_PER_THREAD],
+        OffsetT (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT (&selection_indices)[ITEMS_PER_THREAD],
+        OffsetT num_selections)
+    {
+        // Scatter flagged items; selection_indices[ITEM] is used directly as the output offset
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (selection_flags[ITEM])
+            {
+                // In the last tile, skip flagged slots whose index is not below num_selections
+                if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections)
+                {
+                    d_selected_out[selection_indices[ITEM]] = items[ITEM];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scatter flagged items to output offsets (specialized for two-phase scattering, selections only): compact into shared memory, then copy the compacted run to d_selected_out
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             /*num_tile_items*/,                         ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         /*num_rejected_prefix*/,                    ///< Total number of rejections prior to this tile
+        Int2Type<false> /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
+    {
+        CTA_SYNC();     // raw_exchange is unioned with the other _TempStorage members
+
+        // Phase 1: place each flagged item in shared memory at its tile-local rank (global index minus the prefix of earlier tiles)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix;
+            if (selection_flags[ITEM])
+            {
+                temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
+            }
+        }
+
+        CTA_SYNC();     // All shared-memory writes must land before any thread reads them back
+
+        for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
+        {
+            d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
+        }
+    }
+
+
+    /**
+     * Scatter all items to output offsets (specialized for two-phase scattering, keeping rejects): selections go to the front of d_selected_out, rejections fill it backwards from index num_items - 1
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             num_tile_items,                             ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
+        Int2Type<true>  /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
+    {
+        CTA_SYNC();     // raw_exchange is unioned with the other _TempStorage members
+
+        int tile_num_rejections = num_tile_items - num_tile_selections;
+
+        // Scatter items to shared memory (rejections first): rejections occupy [0, tile_num_rejections), selections follow
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int item_idx                = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+            int local_selection_idx     = selection_indices[ITEM] - num_selections_prefix;
+            int local_rejection_idx     = item_idx - local_selection_idx;
+            int local_scatter_offset    = (selection_flags[ITEM]) ?
+                                            tile_num_rejections + local_selection_idx :
+                                            local_rejection_idx;
+
+            temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
+        }
+
+        CTA_SYNC();     // All shared-memory writes must land before any thread reads them back
+
+        // Gather items from shared memory and scatter to global (thread t handles shared slots t, t + BLOCK_THREADS, ...)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int item_idx            = (ITEM * BLOCK_THREADS) + threadIdx.x;
+            int rejection_idx       = item_idx;
+            int selection_idx       = item_idx - tile_num_rejections;
+            OffsetT scatter_offset  = (item_idx < tile_num_rejections) ?
+                                        num_items - num_rejected_prefix - rejection_idx - 1 :
+                                        num_selections_prefix + selection_idx;
+
+            OutputT item = temp_storage.raw_exchange.Alias()[item_idx];
+
+            if (!IS_LAST_TILE || (item_idx < num_tile_items))
+            {
+                d_selected_out[scatter_offset] = item;
+            }
+        }
+    }
+
+
+    /**
+     * Scatter flagged items, dispatching to ScatterTwoPhase (staged through shared memory) or ScatterDirect
+     */
+    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+    __device__ __forceinline__ void Scatter(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             num_tile_items,                             ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
+        OffsetT         num_selections)                             ///< Total number of selections including this tile
+    {
+        // Two-phase if (a) rejects are kept, or (b) ITEMS_PER_THREAD > 1 and this tile has more than BLOCK_THREADS selections (over one per thread on average); otherwise direct
+        if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS)))
+        {
+            ScatterTwoPhase<IS_LAST_TILE, IS_FIRST_TILE>(
+                items,
+                selection_flags,
+                selection_indices,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_rejected_prefix,
+                Int2Type<KEEP_REJECTS>());
+        }
+        else
+        {
+            ScatterDirect<IS_LAST_TILE, IS_FIRST_TILE>(
+                items,
+                selection_flags,
+                selection_indices,
+                num_selections);
+        }
+    }
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Process first tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
+     */
+    template <bool IS_LAST_TILE>                    // IS_LAST_TILE: true when this first tile is also the final (possibly partial) tile
+    __device__ __forceinline__ OffsetT ConsumeFirstTile(
+        int                 num_tile_items,      ///< Number of input items comprising this tile
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OutputT     items[ITEMS_PER_THREAD];
+        OffsetT     selection_flags[ITEMS_PER_THREAD];
+        OffsetT     selection_indices[ITEMS_PER_THREAD];
+
+        // Load items (the last tile uses the overload bounded by num_tile_items)
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
+        else
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
+
+        // Initialize selection_flags (the leading `true` is presumably the IS_FIRST_TILE argument -- confirm against InitializeSelections)
+        InitializeSelections<true, IS_LAST_TILE>(
+            tile_offset,
+            num_tile_items,
+            items,
+            selection_flags,
+            Int2Type<SELECT_METHOD>());
+
+        CTA_SYNC();     // Barrier before temp_storage.scan is used (presumably it aliases storage used above -- confirm)
+
+        // Exclusive scan of selection_flags: selection_indices receives the per-item results, num_tile_selections the tile-wide sum
+        OffsetT num_tile_selections;
+        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections);
+
+        if (threadIdx.x == 0)
+        {
+            // Update tile status if this is not the last tile (tile 0's inclusive total is just its own count)
+            if (!IS_LAST_TILE)
+                tile_state.SetInclusive(0, num_tile_selections);
+        }
+
+        // Discount any out-of-bounds selections (the subtraction implies each of the TILE_ITEMS - num_tile_items padding slots was counted once)
+        if (IS_LAST_TILE)
+            num_tile_selections -= (TILE_ITEMS - num_tile_items);
+
+        // Scatter flagged items
+        Scatter<IS_LAST_TILE, true>(
+            items,
+            selection_flags,
+            selection_indices,
+            num_tile_items,
+            num_tile_selections,
+            0,                      // num_selections_prefix: passed as zero for the first tile
+            0,                      // num_rejected_prefix: passed as zero for the first tile
+            num_tile_selections);   // num_selections: running total equals this tile's own count
+
+        return num_tile_selections;
+    }
+
+
+    /**
+     * Process subsequent tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
+     */
+    template <bool IS_LAST_TILE>                    // IS_LAST_TILE: true when this tile is the final (possibly partial) tile
+    __device__ __forceinline__ OffsetT ConsumeSubsequentTile(
+        int                 num_tile_items,      ///< Number of input items comprising this tile
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OutputT     items[ITEMS_PER_THREAD];
+        OffsetT     selection_flags[ITEMS_PER_THREAD];
+        OffsetT     selection_indices[ITEMS_PER_THREAD];
+
+        // Load items (the last tile uses the overload bounded by num_tile_items)
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
+        else
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
+
+        // Initialize selection_flags (the leading `false` is presumably the IS_FIRST_TILE argument -- confirm against InitializeSelections)
+        InitializeSelections<false, IS_LAST_TILE>(
+            tile_offset,
+            num_tile_items,
+            items,
+            selection_flags,
+            Int2Type<SELECT_METHOD>());
+
+        CTA_SYNC();     // Barrier before temp_storage.scan / temp_storage.prefix are used (presumably they alias storage used above -- confirm)
+
+        // Exclusive scan of selection_flags, with the tile-prefix callback supplying the running prefix for this tile
+        TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx);
+        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op);
+
+        OffsetT num_tile_selections     = prefix_op.GetBlockAggregate();
+        OffsetT num_selections          = prefix_op.GetInclusivePrefix();
+        OffsetT num_selections_prefix   = prefix_op.GetExclusivePrefix();
+        OffsetT num_rejected_prefix     = (OffsetT(tile_idx) * TILE_ITEMS) - num_selections_prefix;    // widen tile_idx first so the product is formed in OffsetT; int * int can overflow on large inputs
+
+        // Discount any out-of-bounds selections (both the running and the tile-local counts include the padding slots)
+        if (IS_LAST_TILE)
+        {
+            int num_discount    = TILE_ITEMS - num_tile_items;
+            num_selections      -= num_discount;
+            num_tile_selections -= num_discount;
+        }
+
+        // Scatter flagged items
+        Scatter<IS_LAST_TILE, false>(
+            items,
+            selection_flags,
+            selection_indices,
+            num_tile_items,
+            num_tile_selections,
+            num_selections_prefix,
+            num_rejected_prefix,
+            num_selections);
+
+        return num_selections;
+    }
+
+
+    /**
+     * Process a tile of input
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeTile(
+        int                 num_tile_items,         ///< Number of input items comprising this tile
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        // Tile 0 is routed to the first-tile routine, which does not take a tile index
+        const bool is_first_tile = (tile_idx == 0);
+
+        if (is_first_tile)
+            return ConsumeFirstTile<IS_LAST_TILE>(num_tile_items, tile_offset, tile_state);
+
+        // Every other tile is routed to the subsequent-tile routine, which additionally
+        // receives tile_idx.  Either way the value handed back to the caller is the
+        // selection count reported by the chosen routine.
+        return ConsumeSubsequentTile<IS_LAST_TILE>(
+            num_tile_items, tile_idx, tile_offset, tile_state);
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    template <typename NumSelectedIteratorT>        ///< Output iterator type for recording the number of items selected
+    __device__ __forceinline__ void ConsumeRange(
+        int                     num_tiles,          ///< Total number of input tiles
+        ScanTileStateT&         tile_state,         ///< Global tile state descriptor
+        NumSelectedIteratorT    d_num_selected_out) ///< Output for the total number of items selected
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = OffsetT(tile_idx) * TILE_ITEMS;           // Global offset for the current tile (tile_idx is widened first so the product is formed in OffsetT; int * int can overflow on large inputs)
+
+        if (tile_idx < num_tiles - 1)
+        {
+            // Not the last tile (full); the returned running count is not needed here
+            ConsumeTile<false>(TILE_ITEMS, tile_idx, tile_offset, tile_state);
+        }
+        else
+        {
+            // The last tile (possibly partially-full).  NOTE(review): a block with tile_idx >= num_tiles would also land here -- presumably the launch never creates one; confirm
+            OffsetT num_remaining   = num_items - tile_offset;
+            OffsetT num_selections  = ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+
+            if (threadIdx.x == 0)
+            {
+                // Output the total number of items selected (written once, by thread 0 of the block handling the last tile)
+                *d_num_selected_out = num_selections;
+            }
+        }
+    }
+
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/agent/agent_spmv_orig.cuh b/external/cub/cub/agent/agent_spmv_orig.cuh
new file mode 100644
index 00000000000..4e7cb609f76
--- /dev/null
+++ b/external/cub/cub/agent/agent_spmv_orig.cuh
@@ -0,0 +1,670 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../util_type.cuh"
+#include "../block/block_reduce.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../thread/thread_search.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/counting_input_iterator.cuh"
+#include "../iterator/tex_ref_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSpmv
+ */
+template <
+    int                             _BLOCK_THREADS,                         ///< Threads per thread block
+    int                             _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
+    CacheLoadModifier               _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,      ///< Cache load modifier for reading CSR row-offsets during search
+    CacheLoadModifier               _ROW_OFFSETS_LOAD_MODIFIER,             ///< Cache load modifier for reading CSR row-offsets
+    CacheLoadModifier               _COLUMN_INDICES_LOAD_MODIFIER,          ///< Cache load modifier for reading CSR column-indices
+    CacheLoadModifier               _VALUES_LOAD_MODIFIER,                  ///< Cache load modifier for reading CSR values
+    CacheLoadModifier               _VECTOR_VALUES_LOAD_MODIFIER,           ///< Cache load modifier for reading vector values
+    bool                            _DIRECT_LOAD_NONZEROS,                  ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
+    BlockScanAlgorithm              _SCAN_ALGORITHM>                        ///< The BlockScan algorithm to use
+struct AgentSpmvPolicy
+{
+    enum
+    {
+        BLOCK_THREADS                                                   = _BLOCK_THREADS,                       ///< Threads per thread block
+        ITEMS_PER_THREAD                                                = _ITEMS_PER_THREAD,                    ///< Items per thread (per tile of input)
+        DIRECT_LOAD_NONZEROS                                            = _DIRECT_LOAD_NONZEROS,                ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
+    };
+
+    static const CacheLoadModifier  ROW_OFFSETS_SEARCH_LOAD_MODIFIER    = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;    ///< Cache load modifier for reading CSR row-offsets during search
+    static const CacheLoadModifier  ROW_OFFSETS_LOAD_MODIFIER           = _ROW_OFFSETS_LOAD_MODIFIER;           ///< Cache load modifier for reading CSR row-offsets
+    static const CacheLoadModifier  COLUMN_INDICES_LOAD_MODIFIER        = _COLUMN_INDICES_LOAD_MODIFIER;        ///< Cache load modifier for reading CSR column-indices
+    static const CacheLoadModifier  VALUES_LOAD_MODIFIER                = _VALUES_LOAD_MODIFIER;                ///< Cache load modifier for reading CSR values
+    static const CacheLoadModifier  VECTOR_VALUES_LOAD_MODIFIER         = _VECTOR_VALUES_LOAD_MODIFIER;         ///< Cache load modifier for reading vector values
+    static const BlockScanAlgorithm SCAN_ALGORITHM                      = _SCAN_ALGORITHM;                      ///< The BlockScan algorithm to use
+
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+template <                               // Parameter bundle describing one SpMV problem: raw array pointers, dimensions and the alpha/beta scalars
+    typename        ValueT,              ///< Matrix and vector value type
+    typename        OffsetT>             ///< Signed integer type for sequence offsets
+struct SpmvParams
+{
+    ValueT*         d_values;            ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
+    OffsetT*        d_row_end_offsets;   ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
+    OffsetT*        d_column_indices;    ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-based.)
+    ValueT*         d_vector_x;          ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
+    ValueT*         d_vector_y;          ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
+    int             num_rows;            ///< Number of rows of matrix <b>A</b>.
+    int             num_cols;            ///< Number of columns of matrix <b>A</b>.
+    int             num_nonzeros;        ///< Number of nonzero elements of matrix <b>A</b>.
+    ValueT          alpha;               ///< Alpha multiplicand
+    ValueT          beta;                ///< Beta addend-multiplicand
+
+    TexRefInputIterator<ValueT, 66778899, OffsetT>  t_vector_x;    ///< Texture-reference iterator indexed by column; presumably bound to \p d_vector_x by the host-side dispatch, with 66778899 as an arbitrary unique id -- TODO confirm
+};
+
+
+/**
+ * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
+ */
+template <
+    typename    AgentSpmvPolicyT,           ///< Parameterized AgentSpmvPolicy tuning policy type
+    typename    ValueT,                     ///< Matrix and vector value type
+    typename    OffsetT,                    ///< Signed integer type for sequence offsets
+    bool        HAS_ALPHA,                  ///< Whether the input parameter \p alpha is 1
+    bool        HAS_BETA,                   ///< Whether the input parameter \p beta is 0
+    int         PTX_ARCH = CUB_PTX_ARCH>    ///< PTX compute capability
+struct AgentSpmv
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS           = AgentSpmvPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentSpmvPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,         // Merge items consumed per thread block
+    };
+
+    /// 2D merge path coordinate type (used in this file with .x as the row index and .y as the nonzero index)
+    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;
+
+    /// Input iterator wrapper types (for applying cache modifiers); one per policy-selected load modifier
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        RowOffsetsSearchIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        RowOffsetsIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        ColumnIndicesIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::VALUES_LOAD_MODIFIER,
+            ValueT,
+            OffsetT>
+        ValueIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
+            ValueT,
+            OffsetT>
+        VectorValueIteratorT;
+
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index): key is an OffsetT, value is a ValueT
+    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;
+
+    // Reduce-value-by-segment scan operator (ReduceByKeyOp specialized with cub::Sum)
+    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;
+
+    // BlockReduce specialization (over plain ValueT, using the warp-reductions algorithm)
+    typedef BlockReduce<
+            ValueT,
+            BLOCK_THREADS,
+            BLOCK_REDUCE_WARP_REDUCTIONS>
+        BlockReduceT;
+
+    // BlockScan specialization over key-value pairs (used for the per-tile reduce-by-segment scan)
+    typedef BlockScan<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            AgentSpmvPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // BlockScan specialization over plain ValueT values
+    typedef BlockScan<
+            ValueT,
+            BLOCK_THREADS,
+            AgentSpmvPolicyT::SCAN_ALGORITHM>
+        BlockPrefixSumT;
+
+    // BlockExchange specialization (ValueT items, ITEMS_PER_THREAD per thread)
+    typedef BlockExchange<
+            ValueT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD>
+        BlockExchangeT;
+
+    /// Merge item type (either a non-zero value or a row-end offset)
+    union MergeItem     // One slot of the shared merge buffer; a slot holds either a row-end offset or a staged nonzero, never both
+    {
+        // Value type to pair with index type OffsetT (NullType if loading values directly during merge)
+        typedef typename If<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>::Type MergeValueT;
+
+        OffsetT     row_end_offset;     // Interpretation used for the row-end-offset region of the buffer
+        MergeValueT nonzero;            // Interpretation used for the staged-nonzero region (and for the per-row partials written back by the indirect-load path)
+    };
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage     // Raw shared-memory layout used by one thread block
+    {
+        CoordinateT tile_coords[2];     // Kept outside the union, so it does not overlap any of the buffers below
+
+        union Aliasable     // The members below occupy the same storage; users must separate their lifetimes with barriers
+        {
+            // Smem needed for tile of merge items: the indirect-load path places staged nonzeros at index tile_num_rows + ITEMS_PER_THREAD, after the row-end offsets stored from index 0
+            MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];
+
+            // Smem needed for block exchange
+            typename BlockExchangeT::TempStorage exchange;
+
+            // Smem needed for block-wide reduction
+            typename BlockReduceT::TempStorage reduce;
+
+            // Smem needed for tile scanning (key-value reduce-by-segment scan)
+            typename BlockScanT::TempStorage scan;
+
+            // Smem needed for tile prefix sum (plain ValueT scan)
+            typename BlockPrefixSumT::TempStorage prefix_sum;
+
+        } aliasable;
+    };
+
+    /// Temporary storage type (unionable); the constructor obtains the underlying _TempStorage through Alias()
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+
+    _TempStorage&                   temp_storage;         /// Reference to temp_storage
+
+    SpmvParams<ValueT, OffsetT>&    spmv_params;          ///< Reference to the SpMV input parameter bundle (raw pointers, dimensions, alpha/beta)
+
+    ValueIteratorT                  wd_values;            ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
+    RowOffsetsIteratorT             wd_row_end_offsets;   ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
+    ColumnIndicesIteratorT          wd_column_indices;    ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-based.)
+    VectorValueIteratorT            wd_vector_x;          ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
+    VectorValueIteratorT            wd_vector_y;          ///< Wrapped Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em> (read through this wrapper when applying beta)
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentSpmv(
+        TempStorage&                    temp_storage,           ///< Reference to temp_storage
+        SpmvParams<ValueT, OffsetT>&    spmv_params)            ///< SpMV input parameter bundle
+    :
+        temp_storage(temp_storage.Alias()),                     // Parameter shadows the member: the member binds to the aliased _TempStorage
+        spmv_params(spmv_params),                               // Held by reference, not copied
+        wd_values(spmv_params.d_values),                        // Each wd_* member wraps the matching raw pointer in its cache-modified iterator
+        wd_row_end_offsets(spmv_params.d_row_end_offsets),
+        wd_column_indices(spmv_params.d_column_indices),
+        wd_vector_x(spmv_params.d_vector_x),
+        wd_vector_y(spmv_params.d_vector_y)
+    {}
+
+
+
+
+    /**
+     * Consume a merge tile, specialized for direct-load of nonzeros
+     */
+    __device__ __forceinline__ KeyValuePairT ConsumeTile(
+        int             tile_idx,           ///< Merge-tile index (not referenced in this overload)
+        CoordinateT     tile_start_coord,   ///< Merge-path coordinate where the tile starts (.x = row index, .y = nonzero index)
+        CoordinateT     tile_end_coord,     ///< Merge-path coordinate where the tile ends
+        Int2Type<true>  is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
+    {
+        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
+        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;    // Views the merge buffer as an OffsetT array (assumes MergeItem is exactly one OffsetT wide on this path -- TODO confirm)
+
+        // Gather the row end-offsets for the merge tile into shared memory (inclusive bound: tile_num_rows + 1 entries are staged)
+        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
+        {
+            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
+        }
+
+        CTA_SYNC();             // Staged offsets must be visible to every thread before the search reads them
+
+        // Search for the thread's starting coordinate within the merge tile
+        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
+        CoordinateT                     thread_start_coord;
+
+        MergePathSearch(
+            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
+            s_tile_row_end_offsets,                     // List A
+            tile_nonzero_indices,                       // List B
+            tile_num_rows,
+            tile_num_nonzeros,
+            thread_start_coord);
+
+        CTA_SYNC();            // Perf-sync
+
+        // Compute the thread's merge path segment
+        CoordinateT     thread_current_coord = thread_start_coord;
+        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
+
+        ValueT          running_total = 0.0;
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            OffsetT nonzero_idx         = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1);     // Clamped so the loads below never index past the last nonzero
+            OffsetT column_idx          = wd_column_indices[nonzero_idx];
+            ValueT  value               = wd_values[nonzero_idx];
+
+            ValueT  vector_value        = spmv_params.t_vector_x[column_idx];
+#if (CUB_PTX_ARCH >= 350)
+            vector_value                = wd_vector_x[column_idx];          // When compiled in, this overwrites the texture-iterator value read just above
+#endif
+            ValueT  nonzero             = value * vector_value;
+
+            OffsetT row_end_offset      = s_tile_row_end_offsets[thread_current_coord.x];
+
+            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
+            {
+                // Move down (accumulate); key tile_num_rows acts as a sentinel that the scatter loop below skips
+                running_total += nonzero;
+                scan_segment[ITEM].value    = running_total;
+                scan_segment[ITEM].key      = tile_num_rows;
+                ++thread_current_coord.y;
+            }
+            else
+            {
+                // Move right (reset): the current row ends here, so record its partial under the tile-relative row index
+                scan_segment[ITEM].value    = running_total;
+                scan_segment[ITEM].key      = thread_current_coord.x;
+                running_total               = 0.0;
+                ++thread_current_coord.x;
+            }
+        }
+
+        CTA_SYNC();             // The scan below uses aliasable.scan, which shares storage with the row-end offsets read above
+
+        // Block-wide reduce-value-by-segment: each thread contributes its trailing partial, keyed by the row it stopped in
+        KeyValuePairT       tile_carry;
+        ReduceBySegmentOpT  scan_op;
+        KeyValuePairT       scan_item;
+
+        scan_item.value = running_total;
+        scan_item.key   = thread_current_coord.x;
+
+        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
+
+        if (tile_num_rows > 0)
+        {
+            if (threadIdx.x == 0)
+                scan_item.key = -1;     // Thread 0's key is forced to -1, so the key comparison below never matches for it
+
+            // Direct scatter
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                if (scan_segment[ITEM].key < tile_num_rows)     // Only row-end items (non-sentinel keys) produce output
+                {
+                    if (scan_item.key == scan_segment[ITEM].key)
+                        scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value;     // Fold in the carry-in when it belongs to the same row
+
+                    if (HAS_ALPHA)
+                    {
+                        scan_segment[ITEM].value *= spmv_params.alpha;
+                    }
+
+                    if (HAS_BETA)
+                    {
+                        // Update the output vector element (reads the existing y value through the wrapped iterator)
+                        ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key];
+                        scan_segment[ITEM].value += addend;
+                    }
+
+                    // Set the output vector element
+                    spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value;
+                }
+            }
+        }
+
+        // Return the tile's running carry-out (the block-wide aggregate produced by the scan)
+        return tile_carry;
+    }
+
+
+
+    /**
+     * Consume a merge tile, specialized for indirect load of nonzeros
+     */
+    __device__ __forceinline__ KeyValuePairT ConsumeTile(
+        int             tile_idx,           ///< Merge-tile index (not referenced in this overload)
+        CoordinateT     tile_start_coord,   ///< Merge-path coordinate where the tile starts (.x = row index, .y = nonzero index)
+        CoordinateT     tile_end_coord,     ///< Merge-path coordinate where the tile ends
+        Int2Type<false> is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
+    {
+        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
+        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
+
+#if (CUB_PTX_ARCH >= 520)
+
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;     // Staged products start after the row-end-offset region
+
+        // Gather the nonzeros for the merge tile into shared memory (out-of-range slots are skipped by the bounds check)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
+
+            ValueIteratorT a                = wd_values + tile_start_coord.y + nonzero_idx;
+            ColumnIndicesIteratorT ci       = wd_column_indices + tile_start_coord.y + nonzero_idx;
+            ValueT* s                       = s_tile_nonzeros + nonzero_idx;
+
+            if (nonzero_idx < tile_num_nonzeros)
+            {
+
+                OffsetT column_idx              = *ci;
+                ValueT  value                   = *a;
+
+                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
+                vector_value                    = wd_vector_x[column_idx];     // Overwrites the texture-iterator value read on the previous line
+
+                ValueT  nonzero                 = value * vector_value;
+
+                *s    = nonzero;
+            }
+        }
+
+
+#else
+
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;     // Staged products start after the row-end-offset region
+
+        // Gather the nonzeros for the merge tile into shared memory
+        if (tile_num_nonzeros > 0)
+        {
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                int     nonzero_idx             = threadIdx.x + (ITEM * BLOCK_THREADS);
+                nonzero_idx                     = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);     // Clamp instead of branching: out-of-range slots recompute and rewrite the tile's last nonzero
+
+                OffsetT column_idx              = wd_column_indices[tile_start_coord.y + nonzero_idx];
+                ValueT  value                   = wd_values[tile_start_coord.y + nonzero_idx];
+
+                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
+#if (CUB_PTX_ARCH >= 350)
+                vector_value                    = wd_vector_x[column_idx];     // When compiled in, this overwrites the texture-iterator value read just above
+#endif
+                ValueT  nonzero                 = value * vector_value;
+
+                s_tile_nonzeros[nonzero_idx]    = nonzero;
+            }
+        }
+
+#endif
+
+        // Gather the row end-offsets for the merge tile into shared memory (inclusive bound: tile_num_rows + 1 entries are staged)
+        #pragma unroll 1
+        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
+        {
+            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
+        }
+
+        CTA_SYNC();             // Staged nonzeros and offsets must be visible to every thread before they are read
+
+        // Search for the thread's starting coordinate within the merge tile
+        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
+        CoordinateT                     thread_start_coord;
+
+        MergePathSearch(
+            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
+            s_tile_row_end_offsets,                     // List A
+            tile_nonzero_indices,                       // List B
+            tile_num_rows,
+            tile_num_nonzeros,
+            thread_start_coord);
+
+        CTA_SYNC();            // Perf-sync
+
+        // Compute the thread's merge path segment
+        CoordinateT     thread_current_coord = thread_start_coord;
+        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
+        ValueT          running_total = 0.0;
+
+        OffsetT row_end_offset  = s_tile_row_end_offsets[thread_current_coord.x];     // Both operands are fetched ahead of the loop and refreshed only when their coordinate advances
+        ValueT  nonzero         = s_tile_nonzeros[thread_current_coord.y];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
+            {
+                // Move down (accumulate): record the individual product; per-row sums are formed in the downsweep below
+                scan_segment[ITEM].value    = nonzero;
+                running_total               += nonzero;
+                ++thread_current_coord.y;
+                nonzero                     = s_tile_nonzeros[thread_current_coord.y];
+            }
+            else
+            {
+                // Move right (reset): the row ended, so this step contributes nothing to the next row
+                scan_segment[ITEM].value    = 0.0;
+                running_total               = 0.0;
+                ++thread_current_coord.x;
+                row_end_offset              = s_tile_row_end_offsets[thread_current_coord.x];
+            }
+
+            scan_segment[ITEM].key = thread_current_coord.x;     // Key is the row index after the step (so a row-end item is keyed by the following row)
+        }
+
+        CTA_SYNC();             // The scan below uses aliasable.scan, which shares storage with the merge buffer read above
+
+        // Block-wide reduce-value-by-segment: each thread contributes its trailing partial, keyed by the row it stopped in
+        KeyValuePairT       tile_carry;
+        ReduceBySegmentOpT  scan_op;
+        KeyValuePairT       scan_item;
+
+        scan_item.value = running_total;
+        scan_item.key = thread_current_coord.x;
+
+        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
+
+        if (threadIdx.x == 0)
+        {
+            scan_item.key = thread_start_coord.x;     // Thread 0 has no predecessor: give it an explicit zero carry-in keyed by its starting row
+            scan_item.value = 0.0;
+        }
+
+        if (tile_num_rows > 0)     // Barriers inside this branch rely on tile_num_rows being identical for every thread of the block -- presumably guaranteed by the caller; confirm
+        {
+
+            CTA_SYNC();         // Scan storage is about to be overwritten through the s_partials view of the same union
+
+            // Scan downsweep and scatter.  NOTE(review): unlike the direct-load overload, nothing in this overload applies alpha or beta -- confirm this is intended
+            ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;
+
+            if (scan_item.key != scan_segment[0].key)
+            {
+                s_partials[scan_item.key] = scan_item.value;     // Carry-in belongs to an earlier row than this thread's first item: emit it as that row's total
+            }
+            else
+            {
+                scan_segment[0].value += scan_item.value;        // Same row: fold the carry-in into this thread's first item
+            }
+
+            #pragma unroll
+            for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
+                {
+                    s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value;     // Key changed: the previous item holds the finished running sum for its row
+                }
+                else
+                {
+                    scan_segment[ITEM].value += scan_segment[ITEM - 1].value;                  // Same key: keep accumulating within the row
+                }
+            }
+
+            CTA_SYNC();         // All per-row partials must be written before they are copied out
+
+            #pragma unroll 1
+            for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
+            {
+                spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];     // Block-strided copy of the tile's row totals to the output vector
+            }
+        }
+
+        // Return the tile's running carry-out (the block-wide aggregate produced by the scan)
+        return tile_carry;
+    }
+
+
+    /**
+     * Consume the input tile selected by this thread block's index: obtain the tile's start/end coordinates, process the tile, and have thread 0 write the tile's carry-out pair to d_tile_carry_pairs[tile_idx]
+     */
+    __device__ __forceinline__ void ConsumeTile(
+        CoordinateT*    d_tile_coordinates,     ///< [in] Pointer to the temporary array of tile starting coordinates (when NULL, the coordinates are computed here with MergePathSearch)
+        KeyValuePairT*  d_tile_carry_pairs,     ///< [out] Pointer to the temporary array of carry-out pairs (row-id key, dot-product value), written at index tile_idx
+        int             num_merge_tiles)        ///< [in] Number of merge tiles
+    {
+        int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+
+        if (tile_idx >= num_merge_tiles)         // tile_idx depends only on block indices, so the whole block takes this exit together
+            return;
+
+        // Threads 0 and 1 obtain the tile's start and end coordinates, respectively
+        if (threadIdx.x < 2)
+        {
+            if (d_tile_coordinates == NULL)
+            {
+                // No precomputed coordinates: search along this thread's diagonal
+                OffsetT                         diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
+                CoordinateT                     tile_coord;
+                CountingInputIterator<OffsetT>  nonzero_indices(0);
+
+                // Search the merge path
+                MergePathSearch(
+                    diagonal,
+                    RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
+                    nonzero_indices,
+                    spmv_params.num_rows,
+                    spmv_params.num_nonzeros,
+                    tile_coord);
+
+                temp_storage.tile_coords[threadIdx.x] = tile_coord;
+            }
+            else
+            {
+                temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x];
+            }
+        }
+
+        CTA_SYNC();
+
+        CoordinateT tile_start_coord     = temp_storage.tile_coords[0];
+        CoordinateT tile_end_coord       = temp_storage.tile_coords[1];
+
+        // Consume multi-segment tile (overload selected by the DIRECT_LOAD_NONZEROS policy flag)
+        KeyValuePairT tile_carry = ConsumeTile(
+            tile_idx,
+            tile_start_coord,
+            tile_end_coord,
+            Int2Type<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS>());
+
+        // Output the tile's carry-out (thread 0 only)
+        if (threadIdx.x == 0)
+        {
+            if (HAS_ALPHA)
+                tile_carry.value *= spmv_params.alpha;
+
+            tile_carry.key += tile_start_coord.x;    // offset the key by the tile's starting x coordinate (presumably turning a tile-relative row into a global row id -- confirm)
+            d_tile_carry_pairs[tile_idx]    = tile_carry;
+        }
+    }
+
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/agent/single_pass_scan_operators.cuh b/external/cub/cub/agent/single_pass_scan_operators.cuh
new file mode 100644
index 00000000000..2f6713792dd
--- /dev/null
+++ b/external/cub/cub/agent/single_pass_scan_operators.cuh
@@ -0,0 +1,815 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Callback operator types for supplying BlockScan prefixes
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../warp/warp_reduce.cuh"
+#include "../util_arch.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Prefix functor type for maintaining a running prefix while scanning a
+ * region independent of other thread blocks
+ ******************************************************************************/
+
+/**
+ * Stateful callback operator type for supplying BlockScan prefixes.
+ * Maintains a running prefix that can be applied to consecutive
+ * BlockScan operations.
+ */
+template <
+    typename T,                 ///< BlockScan value type
+    typename ScanOpT>            ///< Wrapped scan operator type
+struct BlockScanRunningPrefixOp
+{
+    ScanOpT     op;                 ///< Wrapped scan operator
+    T           running_total;      ///< Running block-wide prefix
+
+    /// Constructor (stores \p op only; running_total is NOT initialized here and must be assigned before the first callback)
+    __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op)
+    :
+        op(op)
+    {}
+
+    /// Constructor (stores \p op and seeds running_total with \p starting_prefix)
+    __device__ __forceinline__ BlockScanRunningPrefixOp(
+        T starting_prefix,
+        ScanOpT op)
+    :
+        op(op),
+        running_total(starting_prefix)
+    {}
+
+    /**
+     * Prefix callback operator.  Returns the running_total held before this call (the value used as the block-wide prefix in thread-0) and then folds \p block_aggregate into running_total.
+     */
+    __device__ __forceinline__ T operator()(
+        const T &block_aggregate)              ///< The aggregate of the BlockScan inputs (combined via the wrapped scan operator)
+    {
+        T retval = running_total;
+        running_total = op(running_total, block_aggregate);
+        return retval;
+    }
+};
+
+
+/******************************************************************************
+ * Generic tile status interface types for block-cooperative scans
+ ******************************************************************************/
+
+/**
+ * Enumerations of tile status
+ */
+enum ScanTileStatus
+{
+    SCAN_TILE_OOB,          // Out-of-bounds (e.g., padding); first enumerator, so its value is 0
+    SCAN_TILE_INVALID = 99, // Not yet processed
+    SCAN_TILE_PARTIAL,      // Tile aggregate is available (value 100)
+    SCAN_TILE_INCLUSIVE,    // Inclusive tile prefix is available (value 101)
+};
+
+
+/**
+ * Tile status interface.
+ */
+template <
+    typename    T,                                      ///< Value type carried with each tile's status
+    bool        SINGLE_WORD = Traits<T>::PRIMITIVE>     ///< Selects the single-word (true) or separate-arrays (false) specialization; defaults to whether T is a primitive type
+struct ScanTileState;
+
+
+/**
+ * Tile status interface specialized for scan status and value types
+ * that can be combined into one machine word that can be
+ * read/written coherently in a single access.
+ */
+template <typename T>
+struct ScanTileState<T, true>
+{
+    // Status word type (integer type selected by sizeof(T): long long for 8, int for 4, short for 2, otherwise char)
+    typedef typename If<(sizeof(T) == 8),
+        long long,
+        typename If<(sizeof(T) == 4),
+            int,
+            typename If<(sizeof(T) == 2),
+                short,
+                char>::Type>::Type>::Type StatusWord;
+
+
+    // Unit word type (the type in which a whole descriptor is loaded/stored with a single access)
+    typedef typename If<(sizeof(T) == 8),
+        longlong2,
+        typename If<(sizeof(T) == 4),
+            int2,
+            typename If<(sizeof(T) == 2),
+                int,
+                uchar2>::Type>::Type>::Type TxnWord;
+
+
+    // Device word type (status/value pair that is overlaid on a TxnWord)
+    struct TileDescriptor
+    {
+        StatusWord  status;
+        T           value;
+    };
+
+
+    // Constants
+    enum
+    {
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,     // number of padding descriptors stored in front of tile 0
+    };
+
+
+    // Device storage (padding descriptors first, then one descriptor per tile)
+    TxnWord *d_tile_descriptors;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ScanTileState()
+    :
+        d_tile_descriptors(NULL)
+    {}
+
+
+    /// Initializer (only records the storage pointer; always returns cudaSuccess)
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     /*num_tiles*/,                      ///< [in] Number of tiles (unused)
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage, used as the tile descriptor array.  (No size query is performed here; see AllocationSize.)
+        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \p d_temp_storage allocation (unused)
+    {
+        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status (num_tiles descriptors plus TILE_STATUS_PADDING padding descriptors)
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,                          ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \p d_temp_storage allocation
+    {
+        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Initialize (from device): one thread per tile marks its descriptor SCAN_TILE_INVALID, and the first TILE_STATUS_PADDING threads of block 0 mark the padding descriptors SCAN_TILE_OOB
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+        TxnWord val = TxnWord();
+        TileDescriptor *descriptor = reinterpret_cast<TileDescriptor*>(&val);    // view of val used to set its status field
+
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            descriptor->status = StatusWord(SCAN_TILE_INVALID);
+            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding (the descriptors addressed when a tile index is negative)
+            descriptor->status = StatusWord(SCAN_TILE_OOB);
+            d_tile_descriptors[threadIdx.x] = val;
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status (both written together as one TxnWord store)
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
+        tile_descriptor.value = tile_inclusive;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status (both written together as one TxnWord store)
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_PARTIAL;
+        tile_descriptor.value = tile_partial;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid, then return its status and value
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int             tile_idx,
+        StatusWord      &status,
+        T               &value)
+    {
+        TileDescriptor tile_descriptor;
+        do
+        {
+            __threadfence_block(); // prevent hoisting loads from loop
+            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+
+        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));    // repeat while any lane covered by the vote still observes SCAN_TILE_INVALID
+
+        status = tile_descriptor.status;
+        value = tile_descriptor.value;
+    }
+
+};
+
+
+
+/**
+ * Tile status interface specialized for scan status and value types that
+ * cannot be combined into one machine word.
+ */
+template <typename T>
+struct ScanTileState<T, false>
+{
+    // Status word type (one byte per tile; kept in its own array, separate from the values)
+    typedef char StatusWord;
+
+    // Constants
+    enum
+    {
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,     // number of padding entries stored in front of tile 0 in each array
+    };
+
+    // Device storage (three parallel arrays, each indexed by TILE_STATUS_PADDING + tile_idx)
+    StatusWord  *d_tile_status;
+    T           *d_tile_partial;
+    T           *d_tile_inclusive;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ScanTileState()
+    :
+        d_tile_status(NULL),
+        d_tile_partial(NULL),
+        d_tile_inclusive(NULL)
+    {}
+
+
+    /// Initializer (carves the status, partial and inclusive arrays out of one storage blob)
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     num_tiles,                          ///< [in] Number of tiles
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage, passed to AliasTemporaries to be split into the three arrays.
+        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \p d_temp_storage allocation
+    {
+        cudaError_t error = cudaSuccess;
+        do
+        {
+            void*   allocations[3];
+            size_t  allocation_sizes[3];
+
+            allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);           // bytes needed for tile status descriptors
+            allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for partials
+            allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for inclusives
+
+            // Compute allocation pointers into the single storage blob
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Alias the offsets
+            d_tile_status       = reinterpret_cast<StatusWord*>(allocations[0]);
+            d_tile_partial      = reinterpret_cast<T*>(allocations[1]);
+            d_tile_inclusive    = reinterpret_cast<T*>(allocations[2]);
+        }
+        while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status (same three allocation sizes as Init, queried by passing a NULL blob to AliasTemporaries)
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,                          ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \p d_temp_storage allocation
+    {
+        // Specify storage allocation requirements
+        size_t  allocation_sizes[3];
+        allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);         // bytes needed for tile status descriptors
+        allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for partials
+        allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for inclusives
+
+        // Set the necessary size of the blob
+        void* allocations[3];
+        return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes));
+    }
+
+
+    /**
+     * Initialize (from device): one thread per tile marks its status SCAN_TILE_INVALID, and the first TILE_STATUS_PADDING threads of block 0 mark the padding statuses SCAN_TILE_OOB
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID);
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding (the entries addressed when a tile index is negative)
+            d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB);
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status (value first, then fence, then status)
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
+    {
+        // Update tile inclusive value
+        ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
+
+        // Fence: order the value store above before the status store below
+        __threadfence();
+
+        // Update tile status
+        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status (value first, then fence, then status)
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
+    {
+        // Update tile partial value
+        ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
+
+        // Fence: order the value store above before the status store below
+        __threadfence();
+
+        // Update tile status
+        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid, then load the value that matches its status
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int             tile_idx,
+        StatusWord      &status,
+        T               &value)
+    {
+        do {
+            status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+
+            __threadfence();    // keep the status load inside the loop, and keep the value loads below from moving above it
+
+        } while (status == SCAN_TILE_INVALID);
+
+        if (status == StatusWord(SCAN_TILE_PARTIAL)) // partial and inclusive values live in separate arrays
+            value = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
+        else
+            value = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+    }
+};
+
+
+/******************************************************************************
+ * ReduceByKey tile status interface types for block-cooperative scans
+ ******************************************************************************/
+
+/**
+ * Tile status interface for reduction by key.
+ *
+ */
+template <
+    typename    ValueT,     ///< Value type of the key-value pair carried with each tile's status
+    typename    KeyT,       ///< Key type of the key-value pair carried with each tile's status
+    bool        SINGLE_WORD = (Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)>     ///< Selects the single-word specialization when ValueT is primitive and key + value occupy fewer than 16 bytes
+struct ReduceByKeyScanTileState;
+
+
+/**
+ * Tile status interface for reduction by key, specialized for scan status and value types that
+ * cannot be combined into one machine word.
+ */
+template <
+    typename    ValueT,
+    typename    KeyT>
+struct ReduceByKeyScanTileState<ValueT, KeyT, false> :
+    ScanTileState<KeyValuePair<KeyT, ValueT> >      // all behavior is inherited from the ScanTileState of the key-value pair type
+{
+    typedef ScanTileState<KeyValuePair<KeyT, ValueT> > SuperClass;
+
+    /// Constructor (delegates to the SuperClass default constructor)
+    __host__ __device__ __forceinline__
+    ReduceByKeyScanTileState() : SuperClass() {}
+};
+
+
+/**
+ * Tile status interface for reduction by key, specialized for scan status and value types that
+ * can be combined into one machine word that can be read/written coherently in a single access.
+ */
+template <
+    typename ValueT,
+    typename KeyT>
+struct ReduceByKeyScanTileState<ValueT, KeyT, true>
+{
+    typedef KeyValuePair<KeyT, ValueT>KeyValuePairT;
+
+    // Constants
+    enum
+    {
+        PAIR_SIZE           = sizeof(ValueT) + sizeof(KeyT),            // bytes of key + value payload
+        TXN_WORD_SIZE       = 1 << Log2<PAIR_SIZE + 1>::VALUE,          // transaction word size (presumably the smallest power of two that also leaves room for a status -- confirm Log2 rounding)
+        STATUS_WORD_SIZE    = TXN_WORD_SIZE - PAIR_SIZE,                // bytes left over for the status field
+
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,                     // number of padding descriptors stored in front of tile 0
+    };
+
+    // Status word type (integer type selected by STATUS_WORD_SIZE: long long for 8, int for 4, short for 2, otherwise char)
+    typedef typename If<(STATUS_WORD_SIZE == 8),
+        long long,
+        typename If<(STATUS_WORD_SIZE == 4),
+            int,
+            typename If<(STATUS_WORD_SIZE == 2),
+                short,
+                char>::Type>::Type>::Type StatusWord;
+
+    // Unit word type (the type in which a whole descriptor is loaded/stored with a single access)
+    typedef typename If<(TXN_WORD_SIZE == 16),
+        longlong2,
+        typename If<(TXN_WORD_SIZE == 8),
+            long long,
+            int>::Type>::Type TxnWord;
+
+    // Device word type (for when sizeof(ValueT) == sizeof(KeyT))
+    struct TileDescriptorBigStatus
+    {
+        KeyT        key;
+        ValueT      value;
+        StatusWord  status;
+    };
+
+    // Device word type (for when sizeof(ValueT) != sizeof(KeyT))
+    struct TileDescriptorLittleStatus
+    {
+        ValueT      value;
+        StatusWord  status;
+        KeyT        key;
+    };
+
+    // Device word type (layout chosen by whether key and value have the same size)
+    typedef typename If<
+            (sizeof(ValueT) == sizeof(KeyT)),
+            TileDescriptorBigStatus,
+            TileDescriptorLittleStatus>::Type
+        TileDescriptor;
+
+
+    // Device storage (padding descriptors first, then one descriptor per tile)
+    TxnWord *d_tile_descriptors;
+
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ReduceByKeyScanTileState()
+    :
+        d_tile_descriptors(NULL)
+    {}
+
+
+    /// Initializer (only records the storage pointer; always returns cudaSuccess)
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     /*num_tiles*/,                      ///< [in] Number of tiles (unused)
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage, used as the tile descriptor array.  (No size query is performed here; see AllocationSize.)
+        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \p d_temp_storage allocation (unused)
+    {
+        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status (num_tiles descriptors plus TILE_STATUS_PADDING padding descriptors)
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,                          ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \p d_temp_storage allocation
+    {
+        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Initialize (from device): one thread per tile marks its descriptor SCAN_TILE_INVALID, and the first TILE_STATUS_PADDING threads of block 0 mark the padding descriptors SCAN_TILE_OOB
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int             tile_idx    = (blockIdx.x * blockDim.x) + threadIdx.x;
+        TxnWord         val         = TxnWord();
+        TileDescriptor  *descriptor = reinterpret_cast<TileDescriptor*>(&val);    // view of val used to set its status field
+
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            descriptor->status = StatusWord(SCAN_TILE_INVALID);
+            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding (the descriptors addressed when a tile index is negative)
+            descriptor->status = StatusWord(SCAN_TILE_OOB);
+            d_tile_descriptors[threadIdx.x] = val;
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive key-value pair and corresponding status (all written together as one TxnWord store)
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status  = SCAN_TILE_INCLUSIVE;
+        tile_descriptor.value   = tile_inclusive.value;
+        tile_descriptor.key     = tile_inclusive.key;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+
+    /**
+     * Update the specified tile's partial key-value pair and corresponding status (all written together as one TxnWord store)
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status  = SCAN_TILE_PARTIAL;
+        tile_descriptor.value   = tile_partial.value;
+        tile_descriptor.key     = tile_partial.key;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid, then return its status and key-value pair
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int                     tile_idx,
+        StatusWord              &status,
+        KeyValuePairT           &value)
+    {
+//        (disabled variant that polls per-thread, without the warp-wide vote used below)
+//        TileDescriptor  tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+//
+//        while (tile_descriptor.status == SCAN_TILE_INVALID)
+//        {
+//            __threadfence_block(); // prevent hoisting loads from loop
+//
+//            alias           = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+//            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+//        }
+//
+//        status      = tile_descriptor.status;
+//        value.value = tile_descriptor.value;
+//        value.key   = tile_descriptor.key;
+
+        TileDescriptor tile_descriptor;
+        do
+        {
+            __threadfence_block(); // prevent hoisting loads from loop
+            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+
+        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));    // repeat while any lane covered by the vote still observes SCAN_TILE_INVALID
+
+        status      = tile_descriptor.status;
+        value.value = tile_descriptor.value;
+        value.key   = tile_descriptor.key;
+    }
+
+};
+
+
+/******************************************************************************
+ * Prefix call-back operator for coupling local block scan within a
+ * block-cooperative scan
+ ******************************************************************************/
+
+/**
+ * Stateful block-scan prefix functor.  Provides the running prefix for
+ * the current tile by using the call-back warp to wait on
+ * aggregates/prefixes from predecessor tiles to become available.
+ */
+template <
+    typename    T,
+    typename    ScanOpT,
+    typename    ScanTileStateT,
+    int         PTX_ARCH = CUB_PTX_ARCH>
+struct TilePrefixCallbackOp
+{
+    // Parameterized warp reduce
+    typedef WarpReduce<T, CUB_PTX_WARP_THREADS, PTX_ARCH> WarpReduceT;
+
+    // Temporary storage type (warp-reduce scratch plus the three values thread 0 publishes for the Get* accessors)
+    struct _TempStorage
+    {
+        typename WarpReduceT::TempStorage   warp_reduce;
+        T                                   exclusive_prefix;
+        T                                   inclusive_prefix;
+        T                                   block_aggregate;
+    };
+
+    // Alias wrapper allowing temporary storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+    // Type of status word
+    typedef typename ScanTileStateT::StatusWord StatusWord;
+
+    // Fields
+    _TempStorage&               temp_storage;       ///< Reference to the aliased temporary storage (warp-reduce scratch and published prefixes/aggregate)
+    ScanTileStateT&             tile_status;        ///< Interface to tile status
+    ScanOpT                     scan_op;            ///< Binary scan operator
+    int                         tile_idx;           ///< The current tile index
+    T                           exclusive_prefix;   ///< Exclusive prefix for the tile
+    T                           inclusive_prefix;   ///< Inclusive prefix for the tile
+
+    // Constructor (exclusive_prefix and inclusive_prefix are not initialized here; they are assigned in operator())
+    __device__ __forceinline__
+    TilePrefixCallbackOp(
+        ScanTileStateT       &tile_status,
+        TempStorage         &temp_storage,
+        ScanOpT              scan_op,
+        int                 tile_idx)
+    :
+        temp_storage(temp_storage.Alias()),
+        tile_status(tile_status),
+        scan_op(scan_op),
+        tile_idx(tile_idx) {}
+
+
+    // Block until all predecessors within the warp-wide window have non-invalid status, then reduce their values
+    __device__ __forceinline__
+    void ProcessWindow(
+        int         predecessor_idx,        ///< Preceding tile index to inspect
+        StatusWord  &predecessor_status,    ///< [out] Preceding tile status
+        T           &window_aggregate)      ///< [out] Relevant partial reduction from this window of preceding tiles
+    {
+        T value;
+        tile_status.WaitForValid(predecessor_idx, predecessor_status, value);
+
+        // Perform a segmented reduction to get the prefix for the current window.
+        // Use the swizzled scan operator because we are now scanning *down* towards thread0.
+
+        int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));    // a tile with a known inclusive prefix ends the segment
+        window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce(
+            value,
+            tail_flag,
+            SwizzleScanOp<ScanOpT>(scan_op));
+    }
+
+
+    // BlockScan prefix callback functor (called by the first warp); returns the tile's exclusive prefix
+    __device__ __forceinline__
+    T operator()(T block_aggregate)
+    {
+
+        // Update our status with our tile-aggregate
+        if (threadIdx.x == 0)
+        {
+            temp_storage.block_aggregate = block_aggregate;
+            tile_status.SetPartial(tile_idx, block_aggregate);
+        }
+
+        int         predecessor_idx = tile_idx - threadIdx.x - 1;    // thread i inspects the tile (i + 1) positions before ours
+        StatusWord  predecessor_status;
+        T           window_aggregate;
+
+        // Wait for the warp-wide window of predecessor tiles to become valid
+        ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
+
+        // The exclusive tile prefix starts out as the current window aggregate
+        exclusive_prefix = window_aggregate;
+
+        // Keep sliding the window back until we come across a tile whose inclusive prefix is known
+        while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff))
+        {
+            predecessor_idx -= CUB_PTX_WARP_THREADS;
+
+            // Update exclusive tile prefix with the window prefix (the earlier window is the left operand)
+            ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
+            exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
+        }
+
+        // Compute the inclusive tile prefix and update the status for this tile
+        if (threadIdx.x == 0)
+        {
+            inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
+            tile_status.SetInclusive(tile_idx, inclusive_prefix);
+
+            temp_storage.exclusive_prefix = exclusive_prefix;
+            temp_storage.inclusive_prefix = inclusive_prefix;
+        }
+
+        // Return exclusive_prefix
+        return exclusive_prefix;
+    }
+
+    // Get the exclusive prefix stored in temporary storage (written by thread 0 in operator())
+    __device__ __forceinline__
+    T GetExclusivePrefix()
+    {
+        return temp_storage.exclusive_prefix;
+    }
+
+    // Get the inclusive prefix stored in temporary storage (written by thread 0 in operator())
+    __device__ __forceinline__
+    T GetInclusivePrefix()
+    {
+        return temp_storage.inclusive_prefix;
+    }
+
+    // Get the block aggregate stored in temporary storage (written by thread 0 in operator())
+    __device__ __forceinline__
+    T GetBlockAggregate()
+    {
+        return temp_storage.block_aggregate;
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/block_adjacent_difference.cuh b/external/cub/cub/block/block_adjacent_difference.cuh
new file mode 100644
index 00000000000..1125fe59cea
--- /dev/null
+++ b/external/cub/cub/block/block_adjacent_difference.cuh
@@ -0,0 +1,596 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockAdjacentDifference class provides [<em>collective</em>](index.html#sec0) methods for applying a binary operator to each pair of adjacent items within an ordered set of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../util_type.cuh"
+#include "../util_ptx.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+template <
+    typename    T,
+    int         BLOCK_DIM_X,
+    int         BLOCK_DIM_Y     = 1,
+    int         BLOCK_DIM_Z     = 1,
+    int         PTX_ARCH        = CUB_PTX_ARCH>
+class BlockAdjacentDifference
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Compile-time constants derived from the template parameters
+    enum
+    {
+        /// The thread block size in threads (product of the three block dimensions); also the length of each _TempStorage array
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /// Shared memory storage layout type (first and last element from each thread's input)
+    struct _TempStorage
+    {
+        T first_items[BLOCK_THREADS];   ///< input[0] of every thread, indexed by linear_tid; the preceding thread reads it when computing its last tail flag
+        T last_items[BLOCK_THREADS];    ///< input[ITEMS_PER_THREAD - 1] of every thread, indexed by linear_tid; the following thread reads it when computing its first head flag
+    };
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator: returns a reference to a function-local __shared__ _TempStorage (used by the default constructor when the caller supplies no storage)
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /// Operator dispatcher (primary template): used when FlagOp accepts a third, index parameter
+    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
+    struct ApplyOp
+    {
+        // Callers pass (earlier item, later item, index); the operator is invoked as flag_op(later, earlier, index)
+        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &earlier, const T &later, int idx)
+        {
+            return flag_op(later, earlier, idx);
+        }
+    };
+
+    /// Operator dispatcher (partial specialization for HAS_PARAM == false): FlagOp takes only the two items
+    template <typename FlagOp>
+    struct ApplyOp<FlagOp, false>
+    {
+        // Callers pass (earlier item, later item, index); the operator is invoked as flag_op(later, earlier) and the index is dropped
+        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &earlier, const T &later, int /*idx*/)
+        {
+            return flag_op(later, earlier);
+        }
+    };
+
+    /// Templated unrolling of item comparison (inductive case): handles item ITERATION, then recurses with ITERATION + 1 until ITERATION == MAX_ITERATIONS
+    template <int ITERATION, int MAX_ITERATIONS>
+    struct Iterate
+    {
+        // Head flags: pair item ITERATION with its in-thread predecessor, item ITERATION - 1
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     linear_tid,                         ///< [in] Calling thread's linear thread id (only used to form the index passed to the operator)
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            preds[ITERATION] = input[ITERATION - 1];
+            // Index handed to the operator is that of the current item: linear_tid * ITEMS_PER_THREAD + ITERATION
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[ITERATION],
+                input[ITERATION],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
+            // Continue with the next item (resolved at compile time; ends at the terminating specialization)
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
+        }
+
+        // Tail flags: pair item ITERATION with its in-thread successor, item ITERATION + 1 (the index passed is the successor's)
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     linear_tid,                         ///< [in] Calling thread's linear thread id (only used to form the index passed to the operator)
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity tail_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITERATION],
+                input[ITERATION + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
+            // Continue with the next item (resolved at compile time; ends at the terminating specialization)
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
+        }
+
+    };
+
+    /// Templated unrolling of item comparison (termination case): selected when ITERATION == MAX_ITERATIONS; both methods are empty
+    template <int MAX_ITERATIONS>
+    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
+    {
+        // Head flags (no-op that ends the compile-time recursion)
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+
+        // Tail flags (no-op that ends the compile-time recursion)
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity tail_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockAdjacentDifference}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Binds temp_storage to PrivateStorage() and caches RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z) in linear_tid.
+     */
+    __device__ __forceinline__ BlockAdjacentDifference()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockAdjacentDifference(
+        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),                                 // bind the member reference to the caller's storage viewed as _TempStorage
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))      // cached once; indexes first_items / last_items
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head flag operations
+     *********************************************************************/
+    //@{
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Head-flag pass that also hands back each item's predecessor in preds. Every thread publishes its last
+    /// item to last_items; after CTA_SYNC() every thread except thread 0 pairs input[0] with the previous
+    /// thread's last item. Thread 0 gets head_flags[0] = 1 and its preds[0] is never written.
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op)                            ///< [in] Binary operator, invoked through ApplyOp with (predecessor, item, item index)
+    {
+        // Publish this thread's final item so the following thread can use it as a predecessor
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        if (linear_tid != 0)
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+        }
+        else
+        {
+            // First item of the tile: nothing to compare against (preds[0] stays undefined)
+            head_flags[0] = 1;
+        }
+
+        // Items 1 .. ITEMS_PER_THREAD - 1 take their predecessors from this thread's own input
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+    /// Head-flag pass with an explicit tile predecessor. Every thread publishes its last item to last_items; after
+    /// CTA_SYNC() thread 0 pairs input[0] with tile_predecessor_item, all others with the previous thread's last item.
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op,                            ///< [in] Binary operator, invoked through ApplyOp with (predecessor, item, item index)
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        // Publish this thread's final item so the following thread can use it as a predecessor
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        // Predecessor of this thread's first item
+        if (linear_tid == 0)
+            preds[0] = tile_predecessor_item;                       // First thread
+        else
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+
+        // Items 1 .. ITEMS_PER_THREAD - 1 take their predecessors from this thread's own input
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+    /// Convenience overload of FlagHeads for callers that do not need the predecessor items: forwards to the
+    /// preds-returning overload with a local scratch array that is discarded on return. Still collective,
+    /// because the overload it forwards to exchanges items through temp_storage and calls CTA_SYNC().
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary operator forwarded unchanged
+    {
+        T scratch_preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, scratch_preds, flag_op);
+    }
+
+
+    /// Convenience overload of FlagHeads with a tile predecessor for callers that do not need the predecessor
+    /// items: forwards to the preds-returning overload with a local scratch array that is discarded on return.
+    /// Still collective, because the overload it forwards to exchanges items through temp_storage and calls CTA_SYNC().
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary operator forwarded unchanged
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        T scratch_preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, scratch_preds, flag_op, tile_predecessor_item);
+    }
+
+
+
+    /// Tail-flag pass without a tile successor. Every thread publishes input[0] to first_items; after
+    /// CTA_SYNC() each thread pairs its last item with the next thread's first item. The last thread
+    /// (linear_tid == BLOCK_THREADS - 1) has no successor and its last tail flag is set to 1.
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary operator, invoked through ApplyOp with (item, successor, successor index)
+    {
+        // Publish this thread's first item so the preceding thread can use it as a successor
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Index handed to the operator: that of the item following this thread's last item
+        int next_idx = (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD;
+
+        // Tail flag of this thread's last item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], next_idx);
+
+        // Items 0 .. ITEMS_PER_THREAD - 2 take their successors from this thread's own input
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /// Tail-flag pass with an explicit tile successor. Every thread publishes input[0] to first_items; after
+    /// CTA_SYNC() each thread pairs its last item with the next thread's first item, except that the last
+    /// thread (linear_tid == BLOCK_THREADS - 1) pairs its last item with tile_successor_item instead.
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary operator, invoked through ApplyOp with (item, successor, successor index)
+        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+    {
+        // Publish this thread's first item so the preceding thread can use it as a successor
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Index handed to the operator: that of the item following this thread's last item
+        int next_idx = (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD;
+
+        // Item that follows this thread's last item
+        T next_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] =
+            ApplyOp<FlagOp>::FlagT(flag_op, input[ITEMS_PER_THREAD - 1], next_item, next_idx);
+
+        // Items 0 .. ITEMS_PER_THREAD - 2 take their successors from this thread's own input
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /// Computes head flags and tail flags in a single pass (one CTA_SYNC()). No tile predecessor/successor is
+    /// supplied: head_flags[0] of thread 0 and the last tail flag of thread BLOCK_THREADS - 1 are set to 1.
+    /// Thread 0 has no predecessor, so it must not read last_items[linear_tid - 1]: linear_tid is unsigned
+    /// and the subtraction would wrap around to an index far outside the array.
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item (preds[0] is only read from shared storage when a previous thread exists)
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /// Head and tail flags in one pass (one CTA_SYNC()), with an explicit tile successor. Thread 0's
+    /// head_flags[0] is set to 1 because no predecessor is supplied; the last thread pairs its last item with
+    /// tile_successor_item, every other thread pairs it with the first item published by the next thread.
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary operator, invoked through ApplyOp
+    {
+        // Publish both boundary items of this thread for its neighbours
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Head flag of this thread's first item
+        if (linear_tid != 0)
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+        else
+        {
+            // First item of the tile: nothing to compare against (preds[0] stays undefined)
+            head_flags[0] = 1;
+        }
+
+        // Index handed to the operator for the last item: that of the item following it
+        int next_idx = (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD;
+
+        T next_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] =
+            ApplyOp<FlagOp>::FlagT(flag_op, input[ITEMS_PER_THREAD - 1], next_item, next_idx);
+
+        // Head flags for items 1 .. ITEMS_PER_THREAD - 1
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Tail flags for items 0 .. ITEMS_PER_THREAD - 2
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+    /// Head and tail flags in one pass (one CTA_SYNC()), with an explicit tile predecessor. Thread 0 pairs
+    /// input[0] with tile_predecessor_item; the last thread (linear_tid == BLOCK_THREADS - 1) has no successor
+    /// and its last tail flag is set to 1.
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary operator, invoked through ApplyOp
+    {
+        // Publish both boundary items of this thread for its neighbours
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Predecessor of this thread's first item
+        if (linear_tid == 0)
+        {
+            preds[0] = tile_predecessor_item;                       // First thread
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+        }
+
+        head_flags[0] =
+            ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+
+        // Index handed to the operator for the last item: that of the item following it
+        int next_idx = (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD;
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], next_idx);
+
+        // Head flags for items 1 .. ITEMS_PER_THREAD - 1
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Tail flags for items 0 .. ITEMS_PER_THREAD - 2
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /// Head and tail flags in one pass (one CTA_SYNC()), with both tile neighbours supplied. Thread 0 pairs
+    /// input[0] with tile_predecessor_item and the last thread pairs its last item with tile_successor_item;
+    /// all other boundary comparisons use the items published by the adjacent threads.
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary operator, invoked through ApplyOp
+    {
+        // Publish both boundary items of this thread for its neighbours
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Predecessor of this thread's first item
+        if (linear_tid == 0)
+        {
+            preds[0] = tile_predecessor_item;                       // First thread
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+        }
+
+        head_flags[0] =
+            ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+
+        // Item that follows this thread's last item
+        T next_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        int next_idx = (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD;
+        tail_flags[ITEMS_PER_THREAD - 1] =
+            ApplyOp<FlagOp>::FlagT(flag_op, input[ITEMS_PER_THREAD - 1], next_item, next_idx);
+
+        // Head flags for items 1 .. ITEMS_PER_THREAD - 1
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Tail flags for items 0 .. ITEMS_PER_THREAD - 2
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/block/block_discontinuity.cuh b/external/cub/cub/block/block_discontinuity.cuh
new file mode 100644
index 00000000000..428882f70ab
--- /dev/null
+++ b/external/cub/cub/block/block_discontinuity.cuh
@@ -0,0 +1,1148 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../util_type.cuh"
+#include "../util_ptx.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                The data type to be flagged.
+ * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
+ * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
+ *   that differ from their predecessors (or successors).  For example, head flags are convenient
+ *   for demarcating disjoint data segments as part of a segmented scan or reduction.
+ * - \blocked
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockDiscontinuity}
+ * \par
+ * The code snippet below illustrates the head flagging of 512 integer items that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+ *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+ *
+ *     // Allocate shared memory for BlockDiscontinuity
+ *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Collectively compute head flags for discontinuities in the segment
+ *     int head_flags[4];
+ *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
+ * The corresponding output \p head_flags in those threads will be
+ * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+ *
+ * \par Performance Considerations
+ * - Incurs zero bank conflicts for most types
+ *
+ */
+template <
+    typename    T,
+    int         BLOCK_DIM_X,
+    int         BLOCK_DIM_Y     = 1,
+    int         BLOCK_DIM_Z     = 1,
+    int         PTX_ARCH        = CUB_PTX_ARCH>
+class BlockDiscontinuity
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Compile-time constants
+    enum
+    {
+        /// Total number of threads in the block (product of the X, Y and Z block dimensions)
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /// Shared memory storage layout type (the first and the last item of each thread's input)
+    struct _TempStorage
+    {
+        T first_items[BLOCK_THREADS];   ///< Slot i holds input[0] of the thread whose linear id is i
+        T last_items[BLOCK_THREADS];    ///< Slot i holds input[ITEMS_PER_THREAD - 1] of the thread whose linear id is i
+    };
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Provides the statically-declared __shared__ instance used when the caller supplies no storage
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage block_private_storage;
+        return block_private_storage;
+    }
+
+
+    /// Flag-operator adapter for functors whose call operator also takes the item's index (HAS_PARAM is true)
+    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
+    struct ApplyOp
+    {
+        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &lhs, const T &rhs, int rhs_idx)
+        {
+            const bool is_flagged = flag_op(lhs, rhs, rhs_idx);
+            return is_flagged;
+        }
+    };
+
+    /// Flag-operator adapter for functors whose call operator takes only the two items (HAS_PARAM is false)
+    template <typename FlagOp>
+    struct ApplyOp<FlagOp, false>
+    {
+        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &lhs, const T &rhs, int /*rhs_idx*/)
+        {
+            const bool is_flagged = flag_op(lhs, rhs);
+            return is_flagged;
+        }
+    };
+
+    /// Compile-time recursion over a thread's items (general step: handles item ITERATION, then recurses)
+    template <int ITERATION, int MAX_ITERATIONS>
+    struct Iterate
+    {
+        // Head flag for item ITERATION, compared against the preceding item of the same thread
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     linear_tid,                         ///< [in] Calling thread's linear id
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            // Index handed to the predicate for the item being flagged
+            const int item_idx = (linear_tid * ITEMS_PER_THREAD) + ITERATION;
+
+            // Record the predecessor, then flag the item against it
+            preds[ITERATION] = input[ITERATION - 1];
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(flag_op, preds[ITERATION], input[ITERATION], item_idx);
+
+            // Continue with the next item
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
+        }
+
+        // Tail flag for item ITERATION, compared against the following item of the same thread
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     linear_tid,                         ///< [in] Calling thread's linear id
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity tail flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            // Index handed to the predicate: that of the successor item
+            const int successor_idx = (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1;
+
+            // Flag the item against the one that follows it
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(flag_op, input[ITERATION], input[ITERATION + 1], successor_idx);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
+        }
+
+    };
+
+    /// Compile-time recursion over a thread's items (final step, selected when ITERATION equals
+    /// MAX_ITERATIONS: both methods do nothing, which ends the recursion)
+    template <int MAX_ITERATIONS>
+    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
+    {
+        // Head flags: nothing left to flag
+        template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int,                                                ///< [in] Calling thread's linear id (unused)
+            FlagT                   (&)[ITEMS_PER_THREAD],      ///< [out] Calling thread's discontinuity head flags (unused)
+            T                       (&)[ITEMS_PER_THREAD],      ///< [in] Calling thread's input items (unused)
+            T                       (&)[ITEMS_PER_THREAD],      ///< [out] Calling thread's predecessor items (unused)
+            FlagOp)                                             ///< [in] Binary boolean flag predicate (unused)
+        {
+            // Intentionally empty
+        }
+
+        // Tail flags: nothing left to flag
+        template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int,                                                ///< [in] Calling thread's linear id (unused)
+            FlagT                   (&)[ITEMS_PER_THREAD],      ///< [out] Calling thread's discontinuity tail flags (unused)
+            T                       (&)[ITEMS_PER_THREAD],      ///< [in] Calling thread's input items (unused)
+            FlagOp)                                             ///< [in] Binary boolean flag predicate (unused)
+        {
+            // Intentionally empty
+        }
+
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Reference to the storage through which threads exchange their first/last items
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id, initialized from RowMajorTid() by the constructors
+    unsigned int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockDiscontinuity}
+    struct TempStorage : Uninitialized<_TempStorage> {};   // the constructor reaches the wrapped _TempStorage through Alias()
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor that uses a private, statically-declared shared memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockDiscontinuity()
+        : temp_storage(PrivateStorage())
+        , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {
+    }
+
+
+    /**
+     * \brief Collective constructor that uses a caller-provided allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockDiscontinuity(
+        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
+        : temp_storage(temp_storage.Alias())
+        , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head flag operations
+     *********************************************************************/
+    //@{
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    // Head-flagging overload that also hands each item's predecessor back through preds.
+    // The tile's first item (input[0] of thread 0) is always flagged and its preds[0] is left unset.
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Publish this thread's final item so the next thread can read it as its predecessor
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        if (linear_tid != 0)
+        {
+            // Compare the first item against the last item of the preceding thread
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+        }
+        else
+        {
+            // No predecessor exists for the tile's first item: always flag it (preds[0] is undefined)
+            head_flags[0] = 1;
+        }
+
+        // Flag the rest of this thread's items against their in-thread predecessors
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+    // Head-flagging overload that also hands each item's predecessor back through preds.
+    // The tile's first item (input[0] of thread 0) is compared against tile_predecessor_item.
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        // Publish this thread's final item so the next thread can read it as its predecessor
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        // Thread 0 takes the caller-supplied predecessor; other threads take the previous thread's last item
+        const bool is_first_thread = (linear_tid == 0);
+        preds[0] = is_first_thread ? tile_predecessor_item : temp_storage.last_items[linear_tid - 1];
+
+        // Flag the first item against that predecessor
+        const int first_idx = linear_tid * ITEMS_PER_THREAD;
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], first_idx);
+
+        // Flag the rest of this thread's items against their in-thread predecessors
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+    /**
+     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute head flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
+     * The corresponding output \p head_flags in those threads will be
+     * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // This overload does not return predecessor items to the caller, so they
+        // are collected in a thread-local scratch array that is simply discarded
+        T scratch_preds[ITEMS_PER_THREAD];
+
+        this->FlagHeads(head_flags, input, scratch_preds, flag_op);
+    }
+
+
+    /**
+     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
+     *   against \p tile_predecessor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item = ...
+     *
+     *     // Collectively compute head flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeads(
+     *         head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>,
+     * and that \p tile_predecessor_item is \p 0.  The corresponding output \p head_flags in those threads will be
+     * <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+    {
+        // This overload does not return predecessor items to the caller, so they
+        // are collected in a thread-local scratch array that is simply discarded
+        T scratch_preds[ITEMS_PER_THREAD];
+
+        this->FlagHeads(head_flags, input, scratch_preds, flag_op, tile_predecessor_item);
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Tail flag operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged.
+     *
+     * \par
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute tail flags for discontinuities in the segment
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
+     * The corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Publish this thread's leading item so the previous thread can read it as its successor
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // The tile's final item has no successor and is always flagged; every other thread
+        // compares its last item against the first item of the thread that follows it
+        const bool is_last_thread = (linear_tid == BLOCK_THREADS - 1);
+        const int  successor_idx  = (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD;
+
+        tail_flags[ITEMS_PER_THREAD - 1] = is_last_thread ?
+            1 :
+            ApplyOp<FlagOp>::FlagT(flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1], successor_idx);
+
+        // Flag the rest of this thread's items against their in-thread successors
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
+     *   against \p tile_successor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item = ...
+     *
+     *     // Collectively compute tail flags for discontinuities in the segment
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         tail_flags, thread_data, cub::Inequality(), tile_successor_item);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
+     * and that \p tile_successor_item is \p 125.  The corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+    {
+        // Publish this thread's leading item so the previous thread can read it as its successor
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // The last thread takes the caller-supplied successor; other threads take the next thread's first item
+        const bool is_last_thread = (linear_tid == BLOCK_THREADS - 1);
+        T next_item = is_last_thread ? tile_successor_item : temp_storage.first_items[linear_tid + 1];
+
+        const int successor_idx = (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD;
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            next_item,
+            successor_idx);
+
+        // Flag the rest of this thread's items against their in-thread successors
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head & tail flag operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute head and tail flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+     *         head_flags, tail_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
+     * The corresponding output \p head_flags
+     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items so neighbouring threads can compare across thread boundaries
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item.  Thread 0 has no predecessor, so only the other
+        // threads may read last_items[linear_tid - 1] (for thread 0 that index is out of bounds).
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
+     *   against \p tile_successor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item = ...
+     *
+     *     // Collectively compute head and tail flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+     *         head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
+     * and that the tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Publish this thread's boundary items through temp_storage
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T predecessors[ITEMS_PER_THREAD];
+
+        // Head flag of the first thread-item: compared with the previous thread's last item
+        if (linear_tid != 0)
+        {
+            predecessors[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                predecessors[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+        else
+        {
+            head_flags[0] = 1;              // Thread 0 has no predecessor: always flagged
+        }
+
+        // Tail flag of the last thread-item: compared with the item that follows it
+        T next_item = (linear_tid != BLOCK_THREADS - 1) ?
+            temp_storage.first_items[linear_tid + 1] :     // Next thread's first item
+            tile_successor_item;                           // Last thread
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            next_item,
+            (linear_tid + 1) * ITEMS_PER_THREAD);
+
+        // Head flags of items 1 .. ITEMS_PER_THREAD-1
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, predecessors, flag_op);
+
+        // Tail flags of items 0 .. ITEMS_PER_THREAD-2
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
+     *   against \p tile_predecessor_item.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item = ...
+     *
+     *     // No successor item is needed by this overload: the last item of
+     *     // thread127 is always tail-flagged
+     *
+     *
+     *     // Collectively compute head and tail flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+     *         head_flags, tile_predecessor_item, tail_flags,
+     *         thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
+     * and that the \p tile_predecessor_item is \p 0 (this overload takes no
+     * \p tile_successor_item).  The corresponding output \p head_flags
+     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Publish this thread's boundary items through temp_storage
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T predecessors[ITEMS_PER_THREAD];
+
+        // Head flag of the first thread-item: compared with the item that precedes it
+        predecessors[0] = (linear_tid != 0) ?
+            temp_storage.last_items[linear_tid - 1] :      // Previous thread's last item
+            tile_predecessor_item;                         // First thread
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            predecessors[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Tail flag of the last thread-item: compared with the next thread's first item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid != BLOCK_THREADS - 1) ?
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid + 1) * ITEMS_PER_THREAD) :
+            1;                              // Last thread has no successor: always flagged
+
+        // Head flags of items 1 .. ITEMS_PER_THREAD-1
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, predecessors, flag_op);
+
+        // Tail flags of items 0 .. ITEMS_PER_THREAD-2
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
+     *   returns \p true (where <em>previous-item</em> is either the preceding item
+     *   in the same thread or the last item in the previous thread).
+     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
+     *   against \p tile_predecessor_item.
+     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
+     *   <tt>input<sub><em>i</em></sub></tt> when
+     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
+     *   returns \p true (where <em>next-item</em> is either the next item
+     *   in the same thread or the first item in the next thread).
+     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
+     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
+     *   against \p tile_successor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item = ...
+     *
+     *     // Have thread127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item = ...
+     *
+     *     // Collectively compute head and tail flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
+     *         thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
+     * that the \p tile_predecessor_item is \p 0, and that the
+     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
+     * and the corresponding output \p tail_flags in those threads will be
+     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
+     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Publish this thread's boundary items through temp_storage
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T predecessors[ITEMS_PER_THREAD];
+
+        // Head flag of the first thread-item: compared with the item that precedes it
+        predecessors[0] = (linear_tid != 0) ?
+            temp_storage.last_items[linear_tid - 1] :      // Previous thread's last item
+            tile_predecessor_item;                         // First thread
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            predecessors[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Tail flag of the last thread-item: compared with the item that follows it
+        T next_item = (linear_tid != BLOCK_THREADS - 1) ?
+            temp_storage.first_items[linear_tid + 1] :     // Next thread's first item
+            tile_successor_item;                           // Last thread
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            next_item,
+            (linear_tid + 1) * ITEMS_PER_THREAD);
+
+        // Head flags of items 1 .. ITEMS_PER_THREAD-1
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, predecessors, flag_op);
+
+        // Tail flags of items 0 .. ITEMS_PER_THREAD-2
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+
+
+    //@}  end member group
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/block/block_exchange.cuh b/external/cub/cub/block/block_exchange.cuh
new file mode 100644
index 00000000000..c0e32fda555
--- /dev/null
+++ b/external/cub/cub/block/block_exchange.cuh
@@ -0,0 +1,1248 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../util_ptx.cuh"
+#include "../util_arch.cuh"
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam InputT               The data type to be exchanged.
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of items partitioned onto each thread.
+ * \tparam WARP_TIME_SLICING    <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds.  Yields a smaller memory footprint at the expense of decreased parallelism.  (Default: false)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - It is commonplace for blocks of threads to rearrange data items between
+ *   threads.  For example, the device-accessible memory subsystem prefers access patterns
+ *   where data items are "striped" across threads (where consecutive threads access consecutive items),
+ *   yet most block-wide operations prefer a "blocked" partitioning of items across threads
+ *   (where consecutive items belong to a single thread).
+ * - BlockExchange supports the following types of data exchanges:
+ *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>striped</em>](index.html#sec5sec3) arrangements
+ *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>warp-striped</em>](index.html#sec5sec3) arrangements
+ *   - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *   - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec3)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockExchange}
+ * \par
+ * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
+ * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ *
+ *     // Allocate shared memory for BlockExchange
+ *     __shared__ typename BlockExchange::TempStorage temp_storage;
+ *
+ *     // Load a tile of data striped across threads
+ *     int thread_data[4];
+ *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+ *
+ *     // Collectively exchange data into a blocked arrangement across threads
+ *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of striped input \p thread_data across the block of threads is
+ * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt>.
+ * The corresponding output \p thread_data in those threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+ *
+ * \par Performance Considerations
+ * - Proper device-specific padding ensures zero bank conflicts for most types.
+ *
+ */
+template <
+    typename    InputT,
+    int         BLOCK_DIM_X,
+    int         ITEMS_PER_THREAD,
+    bool        WARP_TIME_SLICING   = false,
+    int         BLOCK_DIM_Y         = 1,
+    int         BLOCK_DIM_Z         = 1,
+    int         PTX_ARCH            = CUB_PTX_ARCH>
+class BlockExchange
+{
+private:
+
+    /******************************************************************************
+     * Constants
+     ******************************************************************************/
+
+    /// Compile-time constants derived from the template parameters
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+        /// Warp geometry for PTX_ARCH; WARPS rounds up so that a partial warp is counted
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+        /// Shared memory bank count for PTX_ARCH (and its log2)
+        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
+        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
+        /// Total number of items held by the thread block
+        TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD,
+        /// One slice per warp when WARP_TIME_SLICING is enabled, otherwise a single slice
+        TIME_SLICES                 = (WARP_TIME_SLICING) ? WARPS : 1,
+        /// Threads and items covered by one slice (at most one warp's worth when time-slicing)
+        TIME_SLICED_THREADS         = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
+        TIME_SLICED_ITEMS           = TIME_SLICED_THREADS * ITEMS_PER_THREAD,
+        /// Threads and items of a single warp, clamped to the block size
+        WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
+        WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
+
+        // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
+        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
+        PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,   // one padding item per SMEM_BANKS items
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type (declared with 16-byte alignment)
+    struct __align__(16) _TempStorage
+    {
+        InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];    ///< Exchange buffer: one time slice of items plus any bank-conflict padding
+    };
+
+public:
+
+    /// \smemstorage{BlockExchange}
+    struct TempStorage : Uninitialized<_TempStorage> {};   // public alias type; wraps _TempStorage in Uninitialized<>
+
+private:
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
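+    /// Thread coordinates: linear thread-id in the block, lane within the warp, warp index, and the warp's base item offset (warp_id * WARP_TIME_SLICED_ITEMS)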
+    unsigned int linear_tid;
+    unsigned int lane_id;
+    unsigned int warp_id;
+    unsigned int warp_offset;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Returns a reference to a function-local __shared__ _TempStorage, used by the constructor that is given no storage
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage block_scratch;
+        return block_scratch;
+    }
+
+
+    /**
+     * Single-pass <em>blocked</em>-to-<em>striped</em> transpose through shared memory.  Overload selected when warp time-slicing is disabled.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items held in <em>blocked</em> arrangement.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] The exchanged items, in <em>striped</em> arrangement.
+        Int2Type<false> /*time_slicing*/)
+    {
+        // Deposit: thread t fills the ITEMS_PER_THREAD consecutive slots starting at t * ITEMS_PER_THREAD
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = (linear_tid * ITEMS_PER_THREAD) + i;
+            if (INSERT_PADDING) { slot = slot + (slot >> LOG_SMEM_BANKS); }
+            temp_storage.buff[slot] = input_items[i];
+        }
+        CTA_SYNC();
+        // Collect: thread t picks up slots t, t + BLOCK_THREADS, t + 2 * BLOCK_THREADS, ...
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = int(i * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) { slot = slot + (slot >> LOG_SMEM_BANKS); }
+            output_items[i] = temp_storage.buff[slot];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for warp-timeslicing: the exchange runs in TIME_SLICES passes, each staging one warp's items in the shared buffer.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, held in <em>blocked</em> arrangement.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, returned in <em>striped</em> arrangement.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        InputT temp_items[ITEMS_PER_THREAD];    // Per-thread staging; entries are filled in across the passes below
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;            // Tile index of the first item staged in this pass
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;     // One past the last tile index staged in this pass
+
+            CTA_SYNC();     // Reads from the previous pass must finish before the buffer is overwritten
+
+            if (warp_id == SLICE)   // Only the warp whose index equals this pass deposits its blocked items
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+            }
+
+            CTA_SYNC();     // Deposits must be complete before any thread reads the buffer
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Read a strip of items
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+                // Only strips whose tile-index range overlaps the staged slice can contribute in this pass
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
+                {
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;     // This thread's striped element, relative to the slice start
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                        temp_items[ITEM] = temp_storage.buff[item_offset];
+                    }
+                }
+            }
+        }
+
+        // Copy the staged items to the output (converting InputT to OutputT by assignment)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * <em>Blocked</em>-to-<em>warp-striped</em> transpose in which each warp exchanges inside its own region of the shared buffer.  Overload selected when warp time-slicing is disabled.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items held in <em>blocked</em> arrangement.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] The exchanged items, in <em>warp-striped</em> arrangement.
+        Int2Type<false> /*time_slicing*/)
+    {
+        // Deposit: each lane fills ITEMS_PER_THREAD consecutive slots inside the region beginning at warp_offset
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = warp_offset + i + (lane_id * ITEMS_PER_THREAD);
+            if (INSERT_PADDING) { slot = slot + (slot >> LOG_SMEM_BANKS); }
+            temp_storage.buff[slot] = input_items[i];
+        }
+        WARP_SYNC(0xffffffff);      // Warp-level sync only: the slots touched here all lie in this warp's region
+        // Collect: each lane picks up slots lane, lane + WARP_TIME_SLICED_THREADS, ... of the same region
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id;
+            if (INSERT_PADDING) { slot = slot + (slot >> LOG_SMEM_BANKS); }
+            output_items[i] = temp_storage.buff[slot];
+        }
+    }
+
+    /**
+     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for warp-timeslicing: warps take turns using the start of the shared buffer, one warp per pass.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, held in <em>blocked</em> arrangement.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, returned in <em>warp-striped</em> arrangement.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        if (warp_id == 0)   // Warp 0 takes the first turn; this method issues no barrier before it
+        {
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
+                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                temp_storage.buff[item_offset] = input_items[ITEM];
+            }
+
+            WARP_SYNC(0xffffffff);      // Writer and reader are the same warp, so a warp-level sync separates the two phases
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                output_items[ITEM] = temp_storage.buff[item_offset];
+            }
+        }
+
+        #pragma unroll
+        for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
+        {
+            CTA_SYNC();     // The previous warp must be done with the buffer before the next warp reuses it
+
+            if (warp_id == SLICE)   // Same deposit/collect sequence as warp 0, performed by the warp whose turn it is
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+
+                WARP_SYNC(0xffffffff);
+
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    output_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Single-pass <em>striped</em>-to-<em>blocked</em> transpose through shared memory.  Overload selected when warp time-slicing is disabled.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items held in <em>striped</em> arrangement.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] The exchanged items, in <em>blocked</em> arrangement.
+        Int2Type<false> /*time_slicing*/)
+    {
+        // Deposit: thread t fills slots t, t + BLOCK_THREADS, t + 2 * BLOCK_THREADS, ...
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = int(i * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) { slot = slot + (slot >> LOG_SMEM_BANKS); }
+            temp_storage.buff[slot] = input_items[i];
+        }
+
+        CTA_SYNC();
+        // Collect: thread t picks up the ITEMS_PER_THREAD consecutive slots starting at t * ITEMS_PER_THREAD
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = (linear_tid * ITEMS_PER_THREAD) + i;
+            if (INSERT_PADDING) { slot = slot + (slot >> LOG_SMEM_BANKS); }
+            output_items[i] = temp_storage.buff[slot];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing: in each of TIME_SLICES passes all threads deposit the items falling in that slice, then one warp collects them.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, held in <em>striped</em> arrangement.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, returned in <em>blocked</em> arrangement.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        // Warp time-slicing: staging registers, filled during the pass in which this thread's warp collects
+        InputT temp_items[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;            // Tile index of the first item staged in this pass
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;     // One past the last tile index staged in this pass
+
+            CTA_SYNC();     // The previous pass's collecting warp must finish before the buffer is overwritten
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Write a strip of items
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+                // Only strips whose tile-index range overlaps the current slice can contribute in this pass
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
+                {
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;     // This thread's striped element, relative to the slice start
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                        temp_storage.buff[item_offset] = input_items[ITEM];
+                    }
+                }
+            }
+
+            CTA_SYNC();     // All deposits for this slice must be complete before the owning warp reads
+
+            if (warp_id == SLICE)   // The warp whose index equals this pass collects its blocked items
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+
+        // Copy the staged items to the output (converting InputT to OutputT by assignment)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * <em>Warp-striped</em>-to-<em>blocked</em> transpose in which each warp exchanges inside its own region of the shared buffer.  Overload selected when warp time-slicing is disabled.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items held in <em>warp-striped</em> arrangement.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] The exchanged items, in <em>blocked</em> arrangement.
+        Int2Type<false> /*time_slicing*/)
+    {
+        // Deposit: each lane fills slots lane, lane + WARP_TIME_SLICED_THREADS, ... of the region beginning at warp_offset
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = warp_offset + (i * WARP_TIME_SLICED_THREADS) + lane_id;
+            if (INSERT_PADDING) { slot = slot + (slot >> LOG_SMEM_BANKS); }
+            temp_storage.buff[slot] = input_items[i];
+        }
+        WARP_SYNC(0xffffffff);      // Warp-level sync only: the slots touched here all lie in this warp's region
+        // Collect: each lane picks up ITEMS_PER_THREAD consecutive slots of the same region
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = warp_offset + i + (lane_id * ITEMS_PER_THREAD);
+            if (INSERT_PADDING) { slot = slot + (slot >> LOG_SMEM_BANKS); }
+            output_items[i] = temp_storage.buff[slot];
+        }
+    }
+
+
+    /**
+     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing: warps take turns using the start of the shared buffer, one warp per pass.
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, held in <em>warp-striped</em> arrangement.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, returned in <em>blocked</em> arrangement.
+        Int2Type<true>  /*time_slicing*/)
+    {
+        #pragma unroll
+        for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
+        {
+            CTA_SYNC();     // Reached by every thread each pass; the previous warp must be done before the buffer is reused
+
+            if (warp_id == SLICE)   // Only the warp whose turn it is deposits and collects
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+
+                WARP_SYNC(0xffffffff);      // Writer and reader are the same warp, so a warp-level sync separates the two phases
+
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
+                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                    output_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Rank-directed scatter into <em>blocked</em> arrangement: every item is stored at the buffer position named by its rank, then read back blocked.  Overload selected when warp time-slicing is disabled.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to scatter.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] The scattered items, in <em>blocked</em> arrangement.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Destination buffer position of each input item (not range-checked here)
+        Int2Type<false> /*time_slicing*/)
+    {
+        // Deposit: item i of this thread goes to the slot given by ranks[i]
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = ranks[i];
+            if (INSERT_PADDING) { slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot); }
+            temp_storage.buff[slot] = input_items[i];
+        }
+        CTA_SYNC();
+        // Collect: thread t picks up the ITEMS_PER_THREAD consecutive slots starting at t * ITEMS_PER_THREAD
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = (linear_tid * ITEMS_PER_THREAD) + i;
+            if (INSERT_PADDING) { slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot); }
+            output_items[i] = temp_storage.buff[slot];
+        }
+    }
+
+    /**
+     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for warp-timeslicing: in each pass, items whose rank falls inside the current slice are deposited and one warp collects them.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to scatter.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Scattered items, returned in <em>blocked</em> arrangement.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<true>  /*time_slicing*/)
+    {
+        InputT temp_items[ITEMS_PER_THREAD];    // Staging registers, filled during the pass in which this thread's warp collects
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            CTA_SYNC();     // The previous pass's collecting warp must finish before the buffer is overwritten
+
+            const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;     // Rank of the first item staged in this pass
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = ranks[ITEM] - SLICE_OFFSET;
+                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))    // Equals TIME_SLICED_ITEMS whenever WARP_TIME_SLICING is set
+                {
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+            }
+
+            CTA_SYNC();     // All deposits for this slice must be complete before the owning warp reads
+
+            if (warp_id == SLICE)   // The warp whose index equals this pass collects its blocked items
+            {
+                #pragma unroll
+                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+                {
+                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_items[ITEM] = temp_storage.buff[item_offset];
+                }
+            }
+        }
+
+        // Copy the staged items to the output (converting InputT to OutputT by assignment)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+    /**
+     * Rank-directed scatter into <em>striped</em> arrangement: every item is stored at the buffer position named by its rank, then read back striped.  Overload selected when warp time-slicing is disabled.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to scatter.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] The scattered items, in <em>striped</em> arrangement.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Destination buffer position of each input item (not range-checked here)
+        Int2Type<false> /*time_slicing*/)
+    {
+        // Deposit: item i of this thread goes to the slot given by ranks[i]
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = ranks[i];
+            if (INSERT_PADDING) { slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot); }
+            temp_storage.buff[slot] = input_items[i];
+        }
+        CTA_SYNC();
+        // Collect: thread t picks up slots t, t + BLOCK_THREADS, t + 2 * BLOCK_THREADS, ...
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            int slot = int(i * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING) { slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot); }
+            output_items[i] = temp_storage.buff[slot];
+        }
+    }
+
+
+    /**
+     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for warp-timeslicing: in each pass, items whose rank falls inside the current slice are deposited and every thread collects the striped elements that slice covers.
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to scatter.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Scattered items, returned in <em>striped</em> arrangement.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<true> /*time_slicing*/)
+    {
+        InputT temp_items[ITEMS_PER_THREAD];    // Per-thread staging; entries are filled in across the passes below
+
+        #pragma unroll
+        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
+        {
+            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;            // Rank of the first item staged in this pass
+            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;     // One past the last rank staged in this pass
+
+            CTA_SYNC();     // Reads from the previous pass must finish before the buffer is overwritten
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = ranks[ITEM] - SLICE_OFFSET;
+                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))    // Equals TIME_SLICED_ITEMS whenever WARP_TIME_SLICING is set
+                {
+                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+                    temp_storage.buff[item_offset] = input_items[ITEM];
+                }
+            }
+
+            CTA_SYNC();     // All deposits for this slice must be complete before any thread reads
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                // Read a strip of items
+                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
+                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
+                // Only strips whose index range overlaps the staged slice can contribute in this pass
+                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
+                {
+                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;     // This thread's striped element, relative to the slice start
+                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
+                    {
+                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                        temp_items[ITEM] = temp_storage.buff[item_offset];
+                    }
+                }
+            }
+        }
+
+        // Copy the staged items to the output (converting InputT to OutputT by assignment)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            output_items[ITEM] = temp_items[ITEM];
+        }
+    }
+
+
+public:
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockExchange()
+    :   // Initializers are listed in member-declaration order, which is the order they actually run in
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        lane_id(LaneId()),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),     // Fixed at 0 for single-warp blocks
+        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)               // Base of this warp's region for the warp-striped exchanges
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockExchange(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),     // Recover the _TempStorage layout from the caller's opaque allocation
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        lane_id(LaneId()),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),     // Fixed at 0 for single-warp blocks
+        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)               // Base of this warp's region for the warp-striped exchanges
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Structured exchanges
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Load a tile of ordered data into a striped arrangement across block threads
+     *     int thread_data[4];
+     *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
+     *
+     *     // Collectively exchange data into a blocked arrangement across threads
+     *     BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
+     * }
+     * \endcode
+     * \par
+     * Suppose the set of striped input \p thread_data across the block of threads is
+     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> after loading from device-accessible memory.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, held in <em>striped</em> arrangement.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, returned in <em>blocked</em> arrangement.
+    {
+        StripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());     // Tag-dispatch to the time-sliced or single-pass overload
+    }
+
+
+    /**
+     * \brief Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively exchange data into a striped arrangement across threads
+     *     BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
+     *
+     *     // Store data striped across block threads into an ordered tile
+     *     cub::StoreDirectStriped<128>(threadIdx.x, d_data, thread_data);
+     * }
+     * \endcode
+     * \par
+     * Suppose the set of blocked input \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> in
+     * preparation for storing to device-accessible memory.
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, held in <em>blocked</em> arrangement.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, returned in <em>striped</em> arrangement.
+    {
+        BlockedToStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());     // Tag-dispatch to the time-sliced or single-pass overload
+    }
+
+
+
+    /**
+     * \brief Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Load a tile of ordered data into a warp-striped arrangement across warp threads
+     *     int thread_data[4];
+     *     cub::LoadDirectWarpStriped(threadIdx.x, d_data, thread_data);
+     *
+     *     // Collectively exchange data into a blocked arrangement across threads
+     *     BlockExchange(temp_storage).WarpStripedToBlocked(thread_data, thread_data);
+     * }
+     * \endcode
+     * \par
+     * Suppose the set of warp-striped input \p thread_data across the block of threads is
+     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
+     * after loading from device-accessible memory.  (The first 128 items are striped across
+     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, held in <em>warp-striped</em> arrangement.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, returned in <em>blocked</em> arrangement.
+    {
+        WarpStripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());     // Tag-dispatch to the time-sliced or single-pass overload
+    }
+
+
+
+    /**
+     * \brief Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
+     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+     *
+     *     // Allocate shared memory for BlockExchange
+     *     __shared__ typename BlockExchange::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively exchange data into a warp-striped arrangement across threads
+     *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
+     *
+     *     // Store data striped across warp threads into an ordered tile
+     *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of blocked input \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
+     * in preparation for storing to device-accessible memory. (The first 128 items are striped across
+     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
+     *
+     */
+    template <typename OutputT>
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange (one array per calling thread)
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items received from the exchange
+    {
+        // Tag type used to select the overload that matches WARP_TIME_SLICING
+        typedef Int2Type<WARP_TIME_SLICING> TimeSlicingTag;
+
+        BlockedToWarpStriped(input_items, output_items, TimeSlicingTag());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Scatter exchanges
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>blocked</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to scatter
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items received from the exchange
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Scatter rank associated with each input item
+    {
+        // Tag type used to select the overload that matches WARP_TIME_SLICING
+        typedef Int2Type<WARP_TIME_SLICING> TimeSlicingTag;
+
+        ScatterToBlocked(input_items, output_items, ranks, TimeSlicingTag());
+    }
+
+
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to scatter
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items received from the exchange
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Scatter rank associated with each input item
+    {
+        // Tag type used to select the overload that matches WARP_TIME_SLICING
+        typedef Int2Type<WARP_TIME_SLICING> TimeSlicingTag;
+
+        ScatterToStriped(input_items, output_items, ranks, TimeSlicingTag());
+    }
+
+
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.  Items with rank -1 are not exchanged.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OutputT, typename OffsetT>
+    __device__ __forceinline__ void ScatterToStripedGuarded(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to scatter
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items read back in striped order
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Scatter rank per item; items whose rank is negative are not written
+    {
+        // Phase 1: each thread deposits its ranked items into the shared buffer
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; item++)
+        {
+            if (ranks[item] >= 0)
+            {
+                int smem_slot = ranks[item];
+                if (INSERT_PADDING)
+                {
+                    smem_slot = SHR_ADD(smem_slot, LOG_SMEM_BANKS, smem_slot);
+                }
+                temp_storage.buff[smem_slot] = input_items[item];
+            }
+        }
+
+        // Barrier between the scatter writes above and the striped reads below
+        CTA_SYNC();
+
+        // Phase 2: each thread picks up slot (item * BLOCK_THREADS) + linear_tid
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; item++)
+        {
+            int smem_slot = int(item * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING)
+            {
+                smem_slot = SHR_ADD(smem_slot, LOG_SMEM_BANKS, smem_slot);
+            }
+            output_items[item] = temp_storage.buff[smem_slot];
+        }
+    }
+
+
+
+
+    /**
+     * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     * \tparam ValidFlag                            <b>[inferred]</b> FlagT type denoting which items are valid
+     */
+    template <typename OutputT, typename OffsetT, typename ValidFlag>
+    __device__ __forceinline__ void ScatterToStripedFlagged(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to scatter
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items read back in striped order
+        OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Scatter rank associated with each input item
+        ValidFlag   is_valid[ITEMS_PER_THREAD])         ///< [in] Per-item flag; only items whose flag is set are written
+    {
+        // Phase 1: each thread deposits its flagged items into the shared buffer
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; item++)
+        {
+            if (is_valid[item])
+            {
+                int smem_slot = ranks[item];
+                if (INSERT_PADDING)
+                {
+                    smem_slot = SHR_ADD(smem_slot, LOG_SMEM_BANKS, smem_slot);
+                }
+                temp_storage.buff[smem_slot] = input_items[item];
+            }
+        }
+
+        // Barrier between the scatter writes above and the striped reads below
+        CTA_SYNC();
+
+        // Phase 2: each thread picks up slot (item * BLOCK_THREADS) + linear_tid
+        #pragma unroll
+        for (int item = 0; item < ITEMS_PER_THREAD; item++)
+        {
+            int smem_slot = int(item * BLOCK_THREADS) + linear_tid;
+            if (INSERT_PADDING)
+            {
+                smem_slot = SHR_ADD(smem_slot, LOG_SMEM_BANKS, smem_slot);
+            }
+            output_items[item] = temp_storage.buff[smem_slot];
+        }
+    }
+
+
+    //@}  end member group
+
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+    // In-place convenience overload: passes the same array as both the input
+    // and the output of the two-array StripedToBlocked.
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        StripedToBlocked(items, items);
+    }
+
+    // In-place convenience overload: passes the same array as both the input
+    // and the output of the two-array BlockedToStriped.
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        BlockedToStriped(items, items);
+    }
+
+    // In-place convenience overload: passes the same array as both the input
+    // and the output of the two-array WarpStripedToBlocked.
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        WarpStripedToBlocked(items, items);
+    }
+
+    // In-place convenience overload: passes the same array as both the input
+    // and the output of the two-array BlockedToWarpStriped.
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+    {
+        BlockedToWarpStriped(items, items);
+    }
+
+    // In-place convenience overload: passes the same array as both the input
+    // and the output of the three-array ScatterToBlocked.
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToBlocked(items, items, ranks);
+    }
+
+    // In-place convenience overload: passes the same array as both the input
+    // and the output of the three-array ScatterToStriped.
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToStriped(items, items, ranks);
+    }
+
+    // In-place convenience overload: passes the same array as both the input
+    // and the output of the three-array ScatterToStripedGuarded.
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStripedGuarded(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToStripedGuarded(items, items, ranks);
+    }
+
+    // In-place convenience overload of the flagged scatter: the same array is
+    // used as both the input and the output of the four-array
+    // ScatterToStripedFlagged.  Only items whose is_valid flag is set are
+    // written into the exchange buffer.
+    template <typename OffsetT, typename ValidFlag>
+    __device__ __forceinline__ void ScatterToStripedFlagged(
+        InputT      items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD],        ///< [in] Corresponding scatter ranks
+        ValidFlag   is_valid[ITEMS_PER_THREAD])     ///< [in] Corresponding flag denoting item validity
+    {
+        // Must forward to the *Flagged* exchange: the four-argument
+        // ScatterToStriped expects an Int2Type time-slicing tag as its last
+        // argument, not a validity array.
+        ScatterToStripedFlagged(items, items, ranks, is_valid);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Warp-wide exchange of per-thread item arrays through a shared-memory staging buffer.
+ *
+ * \tparam T                        Type of the items being exchanged
+ * \tparam ITEMS_PER_THREAD         Number of items held by each thread
+ * \tparam LOGICAL_WARP_THREADS     Number of threads per logical warp (default: CUB_PTX_WARP_THREADS)
+ * \tparam PTX_ARCH                 Target PTX architecture value (default: CUB_PTX_ARCH)
+ */
+template <
+    typename    T,
+    int         ITEMS_PER_THREAD,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpExchange
+{
+private:
+
+    /******************************************************************************
+     * Constants
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        // Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        // Staging-buffer capacity: one slot per item in the logical warp, plus one extra slot.
+        // NOTE(review): the extra slot is presumably a spare target for ranks equal to
+        // ITEMS_PER_THREAD * LOGICAL_WARP_THREADS -- confirm against callers.
+        WARP_ITEMS                  = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,
+
+        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
+        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
+
+        // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
+        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
+
+        // Extra slots needed when padding is on: one per 2^LOG_SMEM_BANKS buffer slots
+        PADDING_ITEMS               = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        T buff[WARP_ITEMS + PADDING_ITEMS];
+    };
+
+public:
+
+    /// \smemstorage{WarpExchange}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+private:
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Reference to the staging buffer supplied at construction
+    _TempStorage    &temp_storage;
+
+    /// Calling thread's lane index within its logical warp
+    int             lane_id;
+
+public:
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor.  Binds the supplied temporary storage and records the calling thread's logical lane index.
+    __device__ __forceinline__ WarpExchange(
+        TempStorage &temp_storage)              ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS)    // fold the hardware lane into the logical warp
+    {}
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    /**
+     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.
+     *
+     * \par
+     * - \smemreuse
+     * - Each item is written to the staging-buffer slot given by its rank; thread \p lane_id
+     *   then reads back slots <tt>(ITEM * LOGICAL_WARP_THREADS) + lane_id</tt>.
+     * - \p ranks is read-only: bank-conflict padding is applied to a local copy of each
+     *   rank, so the same \p ranks array can safely be reused for a subsequent exchange.
+     *
+     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
+     */
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
+        OffsetT         ranks[ITEMS_PER_THREAD])        ///< [in] Corresponding scatter ranks
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            // Pad a local copy of the rank; the caller's array is left untouched
+            OffsetT item_offset = ranks[ITEM];
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            temp_storage.buff[item_offset] = items[ITEM];
+        }
+
+        // Separate the scatter writes above from the striped reads below
+        WARP_SYNC(0xffffffff);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+};
+
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/block_histogram.cuh b/external/cub/cub/block/block_histogram.cuh
new file mode 100644
index 00000000000..5d393c2353f
--- /dev/null
+++ b/external/cub/cub/block/block_histogram.cuh
@@ -0,0 +1,415 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_histogram_sort.cuh"
+#include "specializations/block_histogram_atomic.cuh"
+#include "../util_ptx.cuh"
+#include "../util_arch.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms.
+ */
+enum BlockHistogramAlgorithm
+{
+
+    /**
+     * \par Overview
+     * Sorting followed by differentiation.  Execution comprises two phases:
+     * -# Sort the data using an efficient radix sort
+     * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are the histogram bin counts.
+     *
+     * \par Performance Considerations
+     * Delivers consistent throughput regardless of sample bin distribution.
+     */
+    BLOCK_HISTO_SORT,
+
+
+    /**
+     * \par Overview
+     * Use atomic addition to update bin counts directly.
+     *
+     * \par Performance Considerations
+     * Performance is strongly tied to the hardware implementation of atomic
+     * addition, and may be significantly degraded for non-uniformly-random
+     * input distributions where many concurrent updates are likely to be
+     * made to the same bin counter.
+     */
+    BLOCK_HISTO_ATOMIC,
+};
+
+
+
+/******************************************************************************
+ * Block histogram
+ ******************************************************************************/
+
+
+/**
+ * \brief The BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                    The sample type being histogrammed (must be castable to an integer bin identifier)
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of items per thread
+ * \tparam BINS                 The number of bins within the histogram
+ * \tparam ALGORITHM            <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
+ *   counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
+ * - BlockHistogram can be optionally specialized to use different algorithms:
+ *   -# <b>cub::BLOCK_HISTO_SORT</b>.  Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm)
+ *   -# <b>cub::BLOCK_HISTO_ATOMIC</b>.  Use atomic addition to update bin counts directly. [More...](\ref cub::BlockHistogramAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockHistogram}
+ * \par
+ * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+ * are partitioned across 128 threads where each thread owns 4 samples.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ *
+ *     // Allocate shared memory for BlockHistogram
+ *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+ *
+ *     // Allocate shared memory for block-wide histogram bin counts
+ *     __shared__ unsigned int smem_histogram[256];
+ *
+ *     // Obtain input samples per thread
+ *     unsigned char data[4];
+ *     ...
+ *
+ *     // Compute the block-wide histogram
+ *     BlockHistogram(temp_storage).Histogram(data, smem_histogram);
+ *
+ * \endcode
+ *
+ * \par Performance and Usage Considerations
+ * - The histogram output can be constructed in shared or device-accessible memory
+ * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
+ *
+ */
+template <
+    typename                T,
+    int                     BLOCK_DIM_X,
+    int                     ITEMS_PER_THREAD,
+    int                     BINS,
+    BlockHistogramAlgorithm ALGORITHM           = BLOCK_HISTO_SORT,
+    int                     BLOCK_DIM_Y         = 1,
+    int                     BLOCK_DIM_Z         = 1,
+    int                     PTX_ARCH            = CUB_PTX_ARCH>
+class BlockHistogram
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Compile-time constants
+    enum
+    {
+        /// Total number of threads in the thread block (product of the three block dimensions)
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /**
+     * The algorithm that is actually used.  The requested ALGORITHM is kept
+     * unless it is BLOCK_HISTO_ATOMIC and PTX_ARCH is below 120, in which
+     * case BLOCK_HISTO_SORT is substituted.
+     */
+    static const BlockHistogramAlgorithm SAFE_ALGORITHM =
+        ((ALGORITHM != BLOCK_HISTO_ATOMIC) || (PTX_ARCH >= 120)) ?
+            ALGORITHM :
+            BLOCK_HISTO_SORT;
+
+    /// Implementation type selected by SAFE_ALGORITHM
+    typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT),
+        BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>,
+        BlockHistogramAtomic<BINS> >::Type InternalBlockHistogram;
+
+    /// Shared memory storage layout type, taken from the selected implementation
+    typedef typename InternalBlockHistogram::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Reference to the temporary storage used by the selected implementation
+    _TempStorage &temp_storage;
+
+    /// Calling thread's linear index, as computed by RowMajorTid
+    unsigned int linear_tid;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Provides a statically declared shared-memory instance of the temporary storage
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+public:
+
+    /// \smemstorage{BlockHistogram}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor that uses a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockHistogram()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor that uses the caller-supplied memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockHistogram(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Histogram operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sets every histogram bin counter to zero.
+     *
+     * \par
+     * Thread \p linear_tid clears bins <tt>linear_tid</tt>, <tt>linear_tid + BLOCK_THREADS</tt>, ...
+     * No barrier is issued by this method.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the initialization and update of a
+     * histogram of 512 integer samples that are partitioned across 128 threads
+     * where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Initialize the block-wide histogram
+     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+     *
+     *     // Update the block-wide histogram
+     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
+     */
+    template <typename CounterT     >
+    __device__ __forceinline__ void InitHistogram(CounterT      histogram[BINS])
+    {
+        // Number of passes in which every thread of the block has a bin to clear
+        const int FULL_PASSES = BINS / BLOCK_THREADS;
+
+        // Unguarded passes
+        #pragma unroll
+        for (int pass = 0; pass < FULL_PASSES; ++pass)
+        {
+            histogram[(pass * BLOCK_THREADS) + linear_tid] = 0;
+        }
+
+        // Guarded final pass, needed only when BINS is not a multiple of BLOCK_THREADS
+        if (BINS % BLOCK_THREADS != 0)
+        {
+            const int tail_base = FULL_PASSES * BLOCK_THREADS;
+            if (tail_base + linear_tid < BINS)
+            {
+                histogram[tail_base + linear_tid] = 0;
+            }
+        }
+    }
+
+
+    /**
+     * \brief Builds a block-wide histogram in shared/device-accessible memory from scratch.  Each thread contributes an array of input elements.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     * - Equivalent to InitHistogram(), a block-wide barrier, then Composite().
+     *
+     * \par Snippet
+     * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+     * are partitioned across 128 threads where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Compute the block-wide histogram
+     *     BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
+     */
+    template <
+        typename            CounterT     >
+    __device__ __forceinline__ void Histogram(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT             histogram[BINS])                ///< [out] Reference to shared/device-accessible memory histogram
+    {
+        // Step 1: clear all bin counters
+        InitHistogram(histogram);
+
+        // Step 2: barrier between the clearing stores and the compositing updates
+        CTA_SYNC();
+
+        // Step 3: accumulate this block's samples using the selected implementation
+        InternalBlockHistogram(temp_storage).Composite(items, histogram);
+    }
+
+
+
+    /**
+     * \brief Accumulates samples into an existing block-wide histogram in shared/device-accessible memory.  Each thread composites an array of input elements.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     * - The histogram is not cleared first; use InitHistogram() or Histogram() for that.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the initialization and update of a
+     * histogram of 512 integer samples that are partitioned across 128 threads
+     * where each thread owns 4 samples.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+     *
+     *     // Allocate shared memory for BlockHistogram
+     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
+     *
+     *     // Allocate shared memory for block-wide histogram bin counts
+     *     __shared__ unsigned int smem_histogram[256];
+     *
+     *     // Obtain input samples per thread
+     *     unsigned char thread_samples[4];
+     *     ...
+     *
+     *     // Initialize the block-wide histogram
+     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+     *
+     *     // Update the block-wide histogram
+     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+     *
+     * \endcode
+     *
+     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
+     */
+    template <
+        typename            CounterT     >
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT             histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
+    {
+        // Delegate to the implementation chosen by SAFE_ALGORITHM
+        InternalBlockHistogram(temp_storage).Composite(items, histogram);
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/block_load.cuh b/external/cub/cub/block/block_load.cuh
new file mode 100644
index 00000000000..234dad295a0
--- /dev/null
+++ b/external/cub/cub/block/block_load.cuh
@@ -0,0 +1,1268 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Operations for reading linear tiles of data into the CUDA thread block.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_exchange.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_ptx.cuh"
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+
+/******************************************************************//**
+ * \name Blocked arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block (unguarded: every slot is read).
+ *
+ * \blocked
+ *
+ * \tparam InputT               <b>[inferred]</b> The data type to load into (each value read through the iterator is assigned to an \p InputT element).
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectBlocked(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    // Thread t owns the run of ITEMS_PER_THREAD items beginning at t * ITEMS_PER_THREAD
+    const int thread_offset = linear_tid * ITEMS_PER_THREAD;
+    InputIteratorT thread_itr = block_itr + thread_offset;
+
+    // Copy this thread's run in order
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        items[i] = thread_itr[i];
+}
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range (slots at or beyond \p valid_items are left unmodified).
+ *
+ * \blocked
+ *
+ * \tparam InputT               <b>[inferred]</b> The data type to load into (each value read through the iterator is assigned to an \p InputT element).
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectBlocked(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items)                ///< [in] Number of valid items to load
+{
+    const int thread_offset = linear_tid * ITEMS_PER_THREAD;
+    InputIteratorT thread_itr = block_itr + thread_offset;
+
+    // Slot i maps to source position thread_offset + i; only in-range positions are read
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        if (thread_offset + i < valid_items)
+            items[i] = thread_itr[i];
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
+ *
+ * \blocked
+ *
+ * \tparam InputT               <b>[inferred]</b> The data type to load into.
+ * \tparam DefaultT             <b>[inferred]</b> The type of the out-of-bound default value (assigned to \p InputT elements).
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    typename        DefaultT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectBlocked(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items,                ///< [in] Number of valid items to load
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+{
+    // Seed every slot with the fall-back value; the guarded load then overwrites the in-range slots
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i) items[i] = oob_default;
+    LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+}
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Internal implementation of the vectorized blocked load: reads the thread's items as vectors of device words using cache modifier \p MODIFIER
+ */
+template <
+    CacheLoadModifier   MODIFIER,
+    typename            T,
+    int                 ITEMS_PER_THREAD>
+__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(
+    int    linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    T      *block_ptr,                 ///< [in] Input pointer for loading from
+    T      (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    // Biggest memory access word that T is a whole multiple of
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;
+
+    enum
+    {
+        // Number of device words spanned by this thread's items
+        TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord),
+
+        // Widest vector width (4, 2, or 1 words) that evenly divides TOTAL_WORDS
+        VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? 4 : ((TOTAL_WORDS % 2 == 0) ? 2 : 1),
+
+        // Number of vector loads issued by each thread
+        VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE,
+    };
+
+    // Vector-of-words type moved by each load
+    typedef typename CubVector<DeviceWord, VECTOR_SIZE>::Type Vector;
+
+    // Local staging array holding the raw vectors
+    Vector staged[VECTORS_PER_THREAD];
+
+    // View the input as vectors and advance to this thread's first vector
+    Vector* thread_vec_ptr = reinterpret_cast<Vector*>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);
+
+    // Issue the vector loads in thread-blocked order
+    #pragma unroll
+    for (int v = 0; v < VECTORS_PER_THREAD; ++v)
+    {
+        staged[v] = ThreadLoad<MODIFIER>(thread_vec_ptr + v);
+    }
+
+    // Reinterpret the staged words as T and copy them into the output array
+    T* staged_items = reinterpret_cast<T*>(staged);
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        items[i] = staged_items[i];
+    }
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block, using vectorized loads with the default cache modifier (LOAD_DEFAULT).
+ *
+ * \blocked
+ *
+ * The input pointer \p block_ptr must be quad-item aligned
+ *
+ * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
+ *   - \p ITEMS_PER_THREAD is odd
+ *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to load (both the pointee type of \p block_ptr and the element type of \p items).
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ */
+template <
+    typename        T,
+    int             ITEMS_PER_THREAD>
+__device__ __forceinline__ void LoadDirectBlockedVectorized(
+    int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    T   *block_ptr,                 ///< [in] Input pointer for loading from
+    T   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
+}
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block (unguarded: every slot is read).
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam InputT               <b>[inferred]</b> The data type to load into (each value read through the iterator is assigned to an \p InputT element).
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    // Thread t begins at source position t
+    InputIteratorT thread_itr = block_itr + linear_tid;
+
+    // Successive slots of one thread are BLOCK_THREADS positions apart
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        items[i] = thread_itr[i * BLOCK_THREADS];
+}
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range (slots at or beyond \p valid_items are left unmodified).
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam InputT               <b>[inferred]</b> The data type to load into (each value read through the iterator is assigned to an \p InputT element).
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items)                ///< [in] Number of valid items to load
+{
+    InputIteratorT thread_itr = block_itr + linear_tid;
+
+    // Slot i maps to source position linear_tid + i * BLOCK_THREADS; only in-range positions are read
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        const int item_offset = i * BLOCK_THREADS;
+        if (linear_tid + item_offset < valid_items)
+            items[i] = thread_itr[item_offset];
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam InputT               <b>[inferred]</b> The data type to load into.
+ * \tparam DefaultT             <b>[inferred]</b> The type of the out-of-bound default value (assigned to \p InputT elements).
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        InputT,
+    typename        DefaultT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items,                ///< [in] Number of valid items to load
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+{
+    // Seed every slot with the fall-back value; the guarded load then overwrites the in-range slots
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i) items[i] = oob_default;
+    LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Warp-striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block (unguarded: every slot is read).
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam InputT               <b>[inferred]</b> The data type to load into (each value read through the iterator is assigned to an \p InputT element).
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    // Split linear_tid into its index within the warp (low bits) and its warp index (high bits)
+    const int lane_id     = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    const int warp_id     = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    const int warp_offset = warp_id * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    // Each warp reads its own contiguous tile; a thread starts at its in-warp index within that tile
+    InputIteratorT thread_itr = block_itr + warp_offset + lane_id ;
+
+    // Successive slots of one thread are CUB_PTX_WARP_THREADS positions apart
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        items[i] = thread_itr[(i * CUB_PTX_WARP_THREADS)];
+}
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range (slots at or beyond \p valid_items are left unmodified).
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam InputT               <b>[inferred]</b> The data type to load into (each value read through the iterator is assigned to an \p InputT element).
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items)                ///< [in] Number of valid items to load
+{
+    // Split linear_tid into its index within the warp (low bits) and its warp index (high bits)
+    const int lane_id     = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    const int warp_id     = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    const int warp_offset = warp_id * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    InputIteratorT thread_itr = block_itr + warp_offset + lane_id ;
+
+    // Walk this thread's slots in warp-striped order
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        // Skip slots whose source position falls at or beyond valid_items
+        if (warp_offset + lane_id + (i * CUB_PTX_WARP_THREADS) < valid_items)
+            items[i] = thread_itr[(i * CUB_PTX_WARP_THREADS)];
+    }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam InputT               <b>[inferred]</b> The data type to load into.
+ * \tparam DefaultT             <b>[inferred]</b> The type of the out-of-bound default value (assigned to \p InputT elements).
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items partitioned onto each thread.
+ * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
+ */
+template <
+    typename        InputT,
+    typename        DefaultT,
+    int             ITEMS_PER_THREAD,
+    typename        InputIteratorT>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items,                ///< [in] Number of valid items to load
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+{
+    // Seed every slot with the fall-back value; the guarded load then overwrites the in-range slots
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i) items[i] = oob_default;
+
+    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
+}
+
+
+
+//@}  end member group
+
+/** @} */       // end group UtilIo
+
+
+
+//-----------------------------------------------------------------------------
+// Generic BlockLoad abstraction
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad
+ *        to read a linear segment of data from memory into a blocked arrangement
+ *        across a CUDA thread block.
+ *
+ * Each enumerator below documents its access pattern and performance considerations.
+ */
+enum BlockLoadAlgorithm
+{
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
+     * directly from memory.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) decreases as the
+     *   access stride between threads increases (i.e., the number of items per thread).
+     */
+    BLOCK_LOAD_DIRECT,
+
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
+     * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
+     * For example, <tt>ld.global.v4.s32</tt> instructions will be generated
+     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high until the
+     *   access stride between threads (i.e., the number of items per thread) exceeds the
+     *   maximum vector load width (typically 4 items or 64B, whichever is lower).
+     * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
+     *   - \p ITEMS_PER_THREAD is odd
+     *   - The \p InputIteratorT is not a simple pointer type
+     *   - The block input offset is not quadword-aligned
+     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+     */
+    BLOCK_LOAD_VECTORIZE,
+
+    /**
+     * \par Overview
+     *
+     * A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
+     * efficiently from memory and then locally transposed into a
+     * [<em>blocked arrangement</em>](index.html#sec5sec3).
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items loaded per thread.
+     * - The local reordering incurs slightly longer latencies and lower throughput than the
+     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
+     */
+    BLOCK_LOAD_TRANSPOSE,
+
+
+    /**
+     * \par Overview
+     *
+     * A [<em>warp-striped arrangement</em>](index.html#sec5sec3) of data is
+     * read efficiently from memory and then locally transposed into a
+     * [<em>blocked arrangement</em>](index.html#sec5sec3).
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items loaded per thread.
+     * - The local reordering incurs slightly larger latencies than the
+     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
+     * - Provisions more shared storage, but incurs smaller latencies than the
+     *   BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
+     */
+    BLOCK_LOAD_WARP_TRANSPOSE,
+
+
+    /**
+     * \par Overview
+     *
+     * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+     * of data is read directly from memory and then is locally transposed into a
+     * [<em>blocked arrangement</em>](index.html#sec5sec3). To reduce the shared memory
+     * requirement, only one warp's worth of shared memory is provisioned and is
+     * subsequently time-sliced among warps.
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items loaded per thread.
+     * - Provisions less shared memory temporary storage, but incurs larger
+     *   latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative.
+     */
+    BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
+};
+
+
+/**
+ * \brief The BlockLoad class provides [<em>collective</em>](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [<em>blocked arrangement</em>](index.html#sec5sec3) across a CUDA thread block.  ![](block_load_logo.png)
+ * \ingroup BlockModule
+ * \ingroup UtilIo
+ *
+ * \tparam InputT               The data type to read into (which must be convertible from the input iterator's value type).
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
+ * \tparam ALGORITHM            <b>[optional]</b> cub::BlockLoadAlgorithm tuning policy.  default: cub::BLOCK_LOAD_DIRECT.
+ *                              Time-slicing one warp's worth of shared memory among the block's warps is selected with cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED (this class has no separate WARP_TIME_SLICING template parameter).
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - The BlockLoad class provides a single data movement abstraction that can be specialized
+ *   to implement different cub::BlockLoadAlgorithm strategies.  This facilitates different
+ *   performance policies for different architectures, data types, granularity sizes, etc.
+ * - BlockLoad can be optionally specialized by different data movement strategies:
+ *   -# <b>cub::BLOCK_LOAD_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory.  [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory using CUDA's built-in vectorized loads as a
+ *      coalescing optimization.    [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_TRANSPOSE</b>.  A [<em>striped arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [<em>blocked arrangement</em>](index.html#sec5sec3) one warp at a time.  [More...](\ref cub::BlockLoadAlgorithm)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockLoad}
+ * \par
+ * The code snippet below illustrates the loading of a linear
+ * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+ * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+ * meaning memory references are efficiently coalesced using a warp-striped access
+ * pattern (after which items are locally reordered among threads).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+ *
+ *     // Allocate shared memory for BlockLoad
+ *     __shared__ typename BlockLoad::TempStorage temp_storage;
+ *
+ *     // Load a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     BlockLoad(temp_storage).Load(d_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+ * The set of \p thread_data across the block of threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+ *
+ */
+template <
+    typename            InputT,
+    int                 BLOCK_DIM_X,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
+    int                 BLOCK_DIM_Y         = 1,
+    int                 BLOCK_DIM_Z         = 1,
+    int                 PTX_ARCH            = CUB_PTX_ARCH>
+class BlockLoad
+{
+private:
+
+    /******************************************************************************
+     * Constants and typed definitions
+     ******************************************************************************/
+
+    /// Compile-time constants derived from the block dimensions
+    enum
+    {
+        /// The thread block size in threads (product of the X, Y and Z block dimensions)
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /******************************************************************************
+     * Algorithmic variants
+     ******************************************************************************/
+
+    /// Load helper (declaration only); partially specialized per BlockLoadAlgorithm value, with DUMMY left as a free parameter
+    template <BlockLoadAlgorithm _POLICY, int DUMMY>
+    struct LoadInternal;
+
+
+    /**
+     * BLOCK_LOAD_DIRECT specialization of load helper: every overload forwards to the matching LoadDirectBlocked function
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
+    {
+        /// Shared memory storage layout type (NullType: this specialization stores nothing in it)
+        typedef NullType TempStorage;
+
+        /// Linear thread-id of the calling thread, captured at construction
+        int linear_tid;
+
+        /// Constructor: records \p linear_tid; the temporary-storage argument is accepted but unused
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory (unguarded)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
+        }
+
+    };
+
+
+    /**
+     * BLOCK_LOAD_VECTORIZE specialization of load helper: pointer-like inputs are routed to InternalLoadDirectBlockedVectorized, all other calls to LoadDirectBlocked
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
+    {
+        /// Shared memory storage layout type (NullType; the constructor below ignores its storage argument)
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor: records the linear thread-id only
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory, specialized for native non-const pointers to InputT (attempts vectorization)
+        template <typename InputIteratorT>                           // NOTE(review): InputIteratorT is not used in the parameter list, so it cannot be deduced from a plain Load(ptr, items) call -- confirm this overload is ever selected
+        __device__ __forceinline__ void Load(
+            InputT               *block_ptr,                     ///< [in] The thread block's base input pointer for loading from
+            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
+        }
+
+        /// Load a linear segment of items from memory, specialized for native const pointers to InputT (attempts vectorization)
+        template <typename InputIteratorT>                           // NOTE(review): same non-deducible template parameter as the non-const pointer overload above -- confirm reachability
+        __device__ __forceinline__ void Load(
+            const InputT         *block_ptr,                     ///< [in] The thread block's base input pointer for loading from
+            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
+        }
+
+        /// Load a linear segment of items from memory, specialized for CacheModifiedInputIterator (attempts vectorization through the wrapped pointer, keeping the iterator's MODIFIER)
+        template <
+            CacheLoadModifier   MODIFIER,
+            typename            ValueType,
+            typename            OffsetT>
+        __device__ __forceinline__ void Load(
+            CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT>    block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT                                                     (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_itr.ptr, items);    // Unwraps the iterator to its raw ptr member
+        }
+
+        /// Load a linear segment of items from memory, generic overload for any other input iterator type (skips vectorization)
+        template <typename _InputIteratorT>
+        __device__ __forceinline__ void Load(
+            _InputIteratorT   block_itr,                    ///< [in] The thread block's base input iterator for loading from
+            InputT           (&items)[ITEMS_PER_THREAD])   ///< [out] Data to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range (skips vectorization for every iterator type)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization for every iterator type)
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT          oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
+        }
+
+    };
+
+
+    /**
+     * BLOCK_LOAD_TRANSPOSE specialization of load helper: items are read with LoadDirectStriped, then rearranged in place by BlockExchange::StripedToBlocked
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
+    {
+        // BlockExchange utility type for the loaded items (4th template argument is false here; the TIMESLICED specialization passes true)
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type: the exchange storage extended with one int
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard (written by thread 0, read back by every thread in the guarded loads)
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor: unwraps the storage through Alias() and records the linear thread-id
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory (unguarded)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);     // Striped read
+            BlockExchange(temp_storage).StripedToBlocked(items, items);         // In-place exchange; _TempStorage is passed as its BlockExchange::TempStorage base
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();                                     // presumably a block-wide barrier so all threads observe thread 0's write -- confirm against CTA_SYNC's definition
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);   // Guard uses the value read back from temp_storage, not the argument
+            BlockExchange(temp_storage).StripedToBlocked(items, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();                                     // presumably a block-wide barrier so all threads observe thread 0's write -- confirm against CTA_SYNC's definition
+            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items, oob_default);  // Guard uses the value read back from temp_storage, not the argument
+            BlockExchange(temp_storage).StripedToBlocked(items, items);
+        }
+
+    };
+
+
+    /**
+     * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper: items are read with LoadDirectWarpStriped, then rearranged in place by BlockExchange::WarpStripedToBlocked
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)       // Warp width for the targeted PTX_ARCH
+        };
+
+        // Compile-time check: BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for the loaded items (4th template argument is false here; the TIMESLICED specialization passes true)
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type: the exchange storage extended with one int
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard (written by thread 0, read back by every thread in the guarded loads)
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor: unwraps the storage through Alias() and records the linear thread-id
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory (unguarded)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items);                // Warp-striped read
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);     // In-place exchange; _TempStorage is passed as its BlockExchange::TempStorage base
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();                                     // presumably a block-wide barrier so all threads observe thread 0's write -- confirm against CTA_SYNC's definition
+            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);  // Guard uses the value read back from temp_storage, not the argument
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();                                     // presumably a block-wide barrier so all threads observe thread 0's write -- confirm against CTA_SYNC's definition
+            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); // Guard uses the value read back from temp_storage, not the argument
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+    };
+
+
+    /**
+     * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper: same call sequence as the WARP_TRANSPOSE specialization, but its BlockExchange is instantiated with true as the 4th template argument
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)       // Warp width for the targeted PTX_ARCH
+        };
+
+        // Compile-time check: BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for the loaded items (4th template argument is true; presumably this selects the time-sliced exchange -- confirm against BlockExchange)
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type: the exchange storage extended with one int
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard (written by thread 0, read back by every thread in the guarded loads)
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor: unwraps the storage through Alias() and records the linear thread-id
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory (unguarded)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items);                // Warp-striped read
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);     // In-place exchange; _TempStorage is passed as its BlockExchange::TempStorage base
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();                                     // presumably a block-wide barrier so all threads observe thread 0's write -- confirm against CTA_SYNC's definition
+            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);  // Guard uses the value read back from temp_storage, not the argument
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();                                     // presumably a block-wide barrier so all threads observe thread 0's write -- confirm against CTA_SYNC's definition
+            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default); // Guard uses the value read back from temp_storage, not the argument
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+    };
+
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Internal load implementation to use: the LoadInternal specialization selected by ALGORITHM (the second argument, 0, fills the DUMMY parameter)
+    typedef LoadInternal<ALGORITHM, 0> InternalLoad;
+
+
+    /// Shared memory storage layout type, taken from the selected LoadInternal specialization
+    typedef typename InternalLoad::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator: returns a reference to a function-local __shared__ _TempStorage (used by the default constructor)
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Thread reference to shared storage (bound by the constructors; handed to InternalLoad on every Load call)
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id (computed by the constructors via RowMajorTid)
+    int linear_tid;
+
+public:
+
+    /// \smemstorage{BlockLoad}
+    struct TempStorage : Uninitialized<_TempStorage> {};     // Wrapper over _TempStorage; the storage-taking constructor recovers the typed storage through Alias()
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage (obtained from PrivateStorage()); the linear thread-id comes from RowMajorTid.
+     */
+    __device__ __forceinline__ BlockLoad()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage (unwrapped through Alias()); the linear thread-id comes from RowMajorTid.
+     */
+    __device__ __forceinline__ BlockLoad(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data movement
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Load a linear segment of items from memory.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the loading of a linear
+     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+     * meaning memory references are efficiently coalesced using a warp-striped access
+     * pattern (after which items are locally reordered among threads).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *
+     *     // Allocate shared memory for BlockLoad
+     *     __shared__ typename BlockLoad::TempStorage temp_storage;
+     *
+     *     // Load a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     BlockLoad(temp_storage).Load(d_data, thread_data);
+     * }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+     * The set of \p thread_data across the block of threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    template <typename InputIteratorT>
+    __device__ __forceinline__ void Load(
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    {
+        InternalLoad(temp_storage, linear_tid).Load(block_itr, items);  // Builds a temporary of the ALGORITHM-selected helper and forwards the call
+    }
+
+
+    /**
+     * \brief Load a linear segment of items from memory, guarded by range.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded loading of a linear
+     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+     * meaning memory references are efficiently coalesced using a warp-striped access
+     * pattern (after which items are locally reordered among threads).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *
+     *     // Allocate shared memory for BlockLoad
+     *     __shared__ typename BlockLoad::TempStorage temp_storage;
+     *
+     *     // Load a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
+     * }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt> and \p valid_items is \p 5.
+     * The set of \p thread_data across the block of threads will be
+     * <tt>{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }</tt>, with only the first two threads
+     * being unmasked to load portions of valid data (and other items remaining unassigned).
+     *
+     */
+    template <typename InputIteratorT>
+    __device__ __forceinline__ void Load(
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+        int             valid_items)                ///< [in] Number of valid items to load
+    {
+        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); // Builds a temporary of the ALGORITHM-selected helper and forwards the call
+    }
+
+
+    /**
+     * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded loading of a linear
+     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+     * meaning memory references are efficiently coalesced using a warp-striped access
+     * pattern (after which items are locally reordered among threads).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *
+     *     // Allocate shared memory for BlockLoad
+     *     __shared__ typename BlockLoad::TempStorage temp_storage;
+     *
+     *     // Load a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
+     * }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt>,
+     * \p valid_items is \p 5, and the out-of-bounds default is \p -1.
+     * The set of \p thread_data across the block of threads will be
+     * <tt>{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }</tt>, with only the first two threads
+     * being unmasked to load portions of valid data (and other items are assigned \p -1).
+     *
+     */
+    template <typename InputIteratorT, typename DefaultT>
+    __device__ __forceinline__ void Load(
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+        int             valid_items,                ///< [in] Number of valid items to load
+        DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
+    {
+        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);    // Builds a temporary of the ALGORITHM-selected helper and forwards the call
+    }
+
+
+    //@}  end member group
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/block_radix_rank.cuh b/external/cub/cub/block/block_radix_rank.cuh
new file mode 100644
index 00000000000..77500ba0ede
--- /dev/null
+++ b/external/cub/cub/block/block_radix_rank.cuh
@@ -0,0 +1,697 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
+ */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "../thread/thread_reduce.cuh"
+#include "../thread/thread_scan.cuh"
+#include "../block/block_scan.cuh"
+#include "../util_ptx.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
+ * \ingroup BlockModule
+ *
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam RADIX_BITS           The number of radix bits per digit place
+ * \tparam IS_DESCENDING           Whether or not the sorted-order is high-to-low
+ * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
+ * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
+ * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
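+ * BlockRadixRank ranks keys by the digit found at one RADIX_BITS-wide digit place, using packed per-thread digit counters in shared memory and a block-wide scan.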
+ * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
+ * - \blocked
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par Examples
+ * \par
+ * - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
+ *      \code
+ *      #include <cub/cub.cuh>
+ *
+ *      template <int BLOCK_THREADS>
+ *      __global__ void ExampleKernel(...)
+ *      {
+ *
+ *      \endcode
+ */
+template <
+    int                     BLOCK_DIM_X,
+    int                     RADIX_BITS,
+    bool                    IS_DESCENDING,
+    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
+    int                     BLOCK_DIM_Y             = 1,
+    int                     BLOCK_DIM_Z             = 1,
+    int                     PTX_ARCH                = CUB_PTX_ARCH>
+class BlockRadixRank
+{
+private:
+
+    /******************************************************************************
+     * Type definitions and constants
+     ******************************************************************************/
+
+    // Integer type for digit counters (to be packed into words of type PackedCounter); NOTE(review): assuming a 16-bit unsigned short, one counter holds at most 65535 -- confirm callers bound the per-thread count accordingly
+    typedef unsigned short DigitCounter;
+
+    // Integer type for packing DigitCounters into columns of shared memory banks: 64-bit when SMEM_CONFIG is cudaSharedMemBankSizeEightByte, otherwise unsigned int
+    typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
+        unsigned long long,
+        unsigned int>::Type PackedCounter;
+
+    enum
+    {
+        // The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        RADIX_DIGITS                = 1 << RADIX_BITS,                                  // Number of distinct digit values for RADIX_BITS bits
+
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,    // Warps per block, rounded up
+
+        BYTES_PER_COUNTER           = sizeof(DigitCounter),
+        LOG_BYTES_PER_COUNTER       = Log2<BYTES_PER_COUNTER>::VALUE,
+
+        PACKING_RATIO               = sizeof(PackedCounter) / sizeof(DigitCounter),     // DigitCounters that fit in one PackedCounter word
+        LOG_PACKING_RATIO           = Log2<PACKING_RATIO>::VALUE,
+
+        LOG_COUNTER_LANES           = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0),                // Always at least one lane
+        COUNTER_LANES               = 1 << LOG_COUNTER_LANES,
+
+        // The number of packed counters per thread (plus one for padding)
+        PADDED_COUNTER_LANES        = COUNTER_LANES + 1,
+        RAKING_SEGMENT              = PADDED_COUNTER_LANES,                             // Packed counters each thread rakes over (see raking_grid)
+    };
+
+public:
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread: RADIX_DIGITS / BLOCK_THREADS (integer division), but never less than 1
+        BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS),
+    };
+
+private:
+
+
+    /// BlockScan type: scans PackedCounter words across the block using INNER_SCAN_ALGORITHM
+    typedef BlockScan<
+            PackedCounter,
+            BLOCK_DIM_X,
+            INNER_SCAN_ALGORITHM,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockScan;
+
+
+    /// Shared memory storage layout type for BlockRadixRank
+    struct __align__(16) _TempStorage
+    {
+        union Aliasable                                         // Two views of the same counter memory
+        {
+            DigitCounter            digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];     // Indexed [lane][thread][sub-counter]
+            PackedCounter           raking_grid[BLOCK_THREADS][RAKING_SEGMENT];                             // Indexed [thread][segment item]; same byte size as digit_counters since RAKING_SEGMENT == PADDED_COUNTER_LANES
+
+        } aliasable;
+
+        // Storage for scanning local ranks
+        typename BlockScan::TempStorage block_scan;
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Reference to the temporary storage this instance operates on
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+    /// Per-thread copy of the raking segment; filled by Upsweep() and consumed by ExclusiveDownsweep() when MEMOIZE_OUTER_SCAN is set
+    PackedCounter cached_segment[RAKING_SEGMENT];
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /**
+     * Provides the statically-declared __shared__ _TempStorage used by the default constructor
+     */
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage block_private_storage;
+        return block_private_storage;
+    }
+
+
+    /**
+     * Upsweep half of the raking scan: sums this thread's raking segment and returns that total
+     */
+    __device__ __forceinline__ PackedCounter Upsweep()
+    {
+        PackedCounter *segment_in_smem = temp_storage.aliasable.raking_grid[linear_tid];
+
+        // By default reduce straight out of shared memory
+        PackedCounter *reduce_src = segment_in_smem;
+
+        if (MEMOIZE_OUTER_SCAN)
+        {
+            // Stage the segment in cached_segment and reduce from that copy instead
+            #pragma unroll
+            for (int slot = 0; slot < RAKING_SEGMENT; ++slot)
+            {
+                cached_segment[slot] = segment_in_smem[slot];
+            }
+
+            reduce_src = cached_segment;
+        }
+
+        PackedCounter segment_total = internal::ThreadReduce<RAKING_SEGMENT>(reduce_src, Sum());
+        return segment_total;
+    }
+
+
+    /// Downsweep half of the raking scan: exclusive-scans this thread's segment, seeded with \p raking_partial
+    __device__ __forceinline__ void ExclusiveDownsweep(
+        PackedCounter raking_partial)
+    {
+        PackedCounter *segment_in_smem = temp_storage.aliasable.raking_grid[linear_tid];
+        if (!MEMOIZE_OUTER_SCAN)
+        {
+            // No cached copy: scan the shared-memory segment in place
+            internal::ThreadScanExclusive<RAKING_SEGMENT>(segment_in_smem, segment_in_smem, Sum(), raking_partial);
+            return;
+        }
+
+        // Scan the copy cached by Upsweep() in place ...
+        PackedCounter *cached_ptr = cached_segment;
+        internal::ThreadScanExclusive<RAKING_SEGMENT>(cached_ptr, cached_ptr, Sum(), raking_partial);
+
+        // ... then write the scanned values back to shared memory
+        #pragma unroll
+        for (int slot = 0; slot < RAKING_SEGMENT; ++slot)
+        {
+            segment_in_smem[slot] = cached_segment[slot];
+        }
+    }
+
+
+    /**
+     * Zeroes this thread's packed counter word in every (padded) counter lane
+     */
+    __device__ __forceinline__ void ResetCounters()
+    {
+        // A single packed store clears all PACKING_RATIO sub-counters of a lane at once
+        #pragma unroll
+        for (int lane = 0; lane < PADDED_COUNTER_LANES; ++lane)
+        {
+            *reinterpret_cast<PackedCounter*>(temp_storage.aliasable.digit_counters[lane][linear_tid]) = 0;
+        }
+    }
+
+
+    /**
+     * Prefix functor handed to the block-wide scan; derives the block prefix from the block aggregate
+     */
+    struct PrefixCallBack
+    {
+        __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate)
+        {
+            PackedCounter carry = 0;
+
+            // Shift the aggregate up by one, two, ... counter widths and accumulate the shifted copies
+            #pragma unroll
+            for (int field = 1; field < PACKING_RATIO; ++field)
+            {
+                carry += block_aggregate << (sizeof(DigitCounter) * 8 * field);
+            }
+
+            return carry;
+        }
+    };
+
+
+    /**
+     * Three-phase scan of the shared counters: per-thread upsweep, block-wide scan, per-thread downsweep
+     */
+    __device__ __forceinline__ void ScanCounters()
+    {
+        // Phase 1: each thread reduces its own raking segment
+        PackedCounter segment_total = Upsweep();
+
+        // Phase 2: block-wide exclusive sum of the per-thread totals
+        PackedCounter segment_base;
+        PrefixCallBack carry_functor;
+        BlockScan(temp_storage.block_scan).ExclusiveSum(segment_total, segment_base, carry_functor);
+
+        // Phase 3: each thread scans its segment, seeded with its exclusive base
+        ExclusiveDownsweep(segment_base);
+    }
+
+public:
+
+    /// \smemstorage{BlockRadixRank}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixRank()
+    :
+        temp_storage(PrivateStorage()),                                     // Bind to the storage declared inside PrivateStorage()
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))      // Thread id obtained from RowMajorTid() over the block dimensions
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixRank(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),                                 // Bind the member to what the caller's storage returns from Alias()
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))      // Thread id obtained from RowMajorTid() over the block dimensions
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Raking
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Rank keys.  Each rank is the key's thread-local digit prefix plus the scanned shared counter for its digit.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits)                           ///< [in] The number of bits in the current digit
+    {
+        DigitCounter    thread_prefixes[KEYS_PER_THREAD];   // For each key, the count of previous keys in this tile having the same digit
+        DigitCounter*   digit_counters[KEYS_PER_THREAD];    // For each key, a pointer to its corresponding digit counter in smem
+
+        // Reset shared memory digit counters
+        ResetCounters();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // Get digit
+            unsigned int digit = BFE(keys[ITEM], current_bit, num_bits);
+
+            // Get sub-counter (the high bits of the digit select the field within a packed word)
+            unsigned int sub_counter = digit >> LOG_COUNTER_LANES;
+
+            // Get counter lane (the low bits of the digit select the lane)
+            unsigned int counter_lane = digit & (COUNTER_LANES - 1);
+
+            if (IS_DESCENDING)
+            {
+                sub_counter = PACKING_RATIO - 1 - sub_counter;
+                counter_lane = COUNTER_LANES - 1 - counter_lane;
+            }
+
+            // Pointer to smem digit counter
+            digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter];
+
+            // Load thread-exclusive prefix
+            thread_prefixes[ITEM] = *digit_counters[ITEM];
+
+            // Store inclusive prefix
+            *digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
+        }
+
+        CTA_SYNC();
+
+        // Scan shared memory counters
+        ScanCounters();
+
+        CTA_SYNC();
+        // Extract the local ranks of each key (unrolled like the loop above so the per-key arrays are indexed statically)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // Add in thread block exclusive prefix
+            ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
+        }
+    }
+
+
+    /**
+     * \brief Rank keys, and additionally report the exclusive digit prefix for the bins tracked by the calling thread.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits,                           ///< [in] The number of bits in the current digit
+        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(linear_tid * BINS_TRACKED_PER_THREAD) ... (linear_tid * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        // Compute the per-key ranks first; the scanned counters are read back from shared memory below
+        RankKeys(keys, ranks, current_bit, num_bits);
+
+        // Read back the exclusive prefix of each bin assigned to this thread
+        #pragma unroll
+        for (int slot = 0; slot < BINS_TRACKED_PER_THREAD; ++slot)
+        {
+            int bin = (linear_tid * BINS_TRACKED_PER_THREAD) + slot;
+
+            // Bins at or beyond RADIX_DIGITS do not exist, so there is nothing to report for them
+            if ((BLOCK_THREADS != RADIX_DIGITS) && (bin >= RADIX_DIGITS))
+                continue;
+
+            // Descending ranking mirrors the bin index
+            const int mirrored = IS_DESCENDING ? (RADIX_DIGITS - bin - 1) : bin;
+
+            // Every thread reads from thread 0's counter column, so bank conflicts are unavoidable here
+            const unsigned int lane = (mirrored & (COUNTER_LANES - 1));
+            const unsigned int sub  = mirrored >> (LOG_COUNTER_LANES);
+
+            exclusive_digit_prefix[slot] = temp_storage.aliasable.digit_counters[lane][0][sub];
+        }
+    }
+};
+
+
+
+
+
+/**
+ * Radix-rank using match.any
+ */
+template <
+    int                     BLOCK_DIM_X,
+    int                     RADIX_BITS,
+    bool                    IS_DESCENDING,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    int                     BLOCK_DIM_Y             = 1,
+    int                     BLOCK_DIM_Z             = 1,
+    int                     PTX_ARCH                = CUB_PTX_ARCH>
+class BlockRadixRankMatch
+{
+private:
+
+    /******************************************************************************
+     * Type definitions and constants
+     ******************************************************************************/
+
+    typedef int32_t    RankT;
+    typedef int32_t    DigitCounterT;
+
+    enum
+    {
+        // The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+        // Number of distinct digit values representable in RADIX_BITS bits
+        RADIX_DIGITS                = 1 << RADIX_BITS,
+        // Warp geometry for PTX_ARCH; WARPS is rounded up so a partial trailing warp is counted
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+        // Warp count padded up to an odd number
+        PADDED_WARPS            = ((WARPS & 0x1) == 0) ?
+                                    WARPS + 1 :
+                                    WARPS,
+        // Total per-warp digit counters, and the (odd-padded) number of them each thread scans
+        COUNTERS                = PADDED_WARPS * RADIX_DIGITS,
+        RAKING_SEGMENT          = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
+        PADDED_RAKING_SEGMENT   = ((RAKING_SEGMENT & 0x1) == 0) ?
+                                    RAKING_SEGMENT + 1 :
+                                    RAKING_SEGMENT,
+    };
+
+public:
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS),
+    };
+
+private:
+
+    /// BlockScan type.  Takes the per-dimension extents (X, Y, Z) so that its thread count equals BLOCK_THREADS
+    typedef BlockScan<
+            DigitCounterT,
+            BLOCK_DIM_X,
+            INNER_SCAN_ALGORITHM,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockScanT;
+
+
+    /// Shared memory storage layout type for BlockRadixRankMatch
+    struct __align__(16) _TempStorage
+    {
+        typename BlockScanT::TempStorage            block_scan;
+
+        union __align__(16) Aliasable
+        {
+            volatile DigitCounterT                  warp_digit_counters[RADIX_DIGITS][PADDED_WARPS];    // Counters indexed [digit][warp]
+            DigitCounterT                           raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT];  // The same memory viewed as one scan segment per thread
+
+        } aliasable;
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+
+public:
+
+    /// \smemstorage{BlockRadixRankMatch}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixRankMatch(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Raking
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Rank keys.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits)                           ///< [in] The number of bits in the current digit
+    {
+        // Initialize shared digit counters (each thread zeroes its own segment of the aliased raking grid)
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0;
+
+        CTA_SYNC();
+
+        // Each warp will strip-mine its section of input, one strip at a time
+
+        volatile DigitCounterT  *digit_counters[KEYS_PER_THREAD];
+        uint32_t                lane_id         = LaneId();
+        uint32_t                warp_id         = linear_tid >> LOG_WARP_THREADS;
+        uint32_t                lane_mask_lt    = LaneMaskLt();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // My digit
+            uint32_t digit = BFE(keys[ITEM], current_bit, num_bits);
+
+            if (IS_DESCENDING)
+                digit = RADIX_DIGITS - digit - 1;
+
+            // Mask of peers who have same digit as me
+            uint32_t peer_mask = MatchAny<RADIX_BITS>(digit);
+
+            // Pointer to smem digit counter for this key
+            digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id];
+
+            // Number of occurrences in previous strips
+            DigitCounterT warp_digit_prefix = *digit_counters[ITEM];
+
+            // Warp-sync: every lane has read the counter before any lane updates it
+            WARP_SYNC(0xFFFFFFFF);
+
+            // Number of peers having same digit as me
+            int32_t digit_count = __popc(peer_mask);
+
+            // Number of lower-ranked peers having same digit seen so far
+            int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
+
+            if (peer_digit_prefix == 0)
+            {
+                // First thread for each digit updates the shared warp counter
+                *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
+            }
+
+            // Warp-sync: the update is complete before the next strip reads the counter
+            WARP_SYNC(0xFFFFFFFF);
+
+            // Number of prior keys having same digit
+            ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
+        }
+
+        CTA_SYNC();
+
+        // Scan warp counters
+
+        DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM];
+
+        BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM];
+
+        CTA_SYNC();
+
+        // Seed ranks with counter values from previous warps
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+            ranks[ITEM] += *digit_counters[ITEM];
+    }
+
+
+    /**
+     * \brief Rank keys, and additionally report the exclusive digit prefix for the bins tracked by the calling thread.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits,                           ///< [in] The number of bits in the current digit
+        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(linear_tid * BINS_TRACKED_PER_THREAD) ... (linear_tid * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        RankKeys(keys, ranks, current_bit, num_bits);
+
+        // Get exclusive count for each digit
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                    bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+                exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0];
+            }
+        }
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/block/block_radix_sort.cuh b/external/cub/cub/block/block_radix_sort.cuh
new file mode 100644
index 00000000000..736fbde746a
--- /dev/null
+++ b/external/cub/cub/block/block_radix_sort.cuh
@@ -0,0 +1,862 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block.
+ */
+
+
+#pragma once
+
+#include "block_exchange.cuh"
+#include "block_radix_rank.cuh"
+#include "../util_ptx.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method.  ![](sorting_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam KeyT                 Key type
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of items per thread
+ * \tparam ValueT               <b>[optional]</b> Value type (default: cub::NullType, which indicates a keys-only sort)
+ * \tparam RADIX_BITS           <b>[optional]</b> The number of radix bits per digit place (default: 4 bits)
+ * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).
+ * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
+ * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ *   items into ascending order.  It relies upon a positional representation for
+ *   keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ *   characters, etc.) specified from least-significant to most-significant.  For a
+ *   given input sequence of keys and a set of rules specifying a total ordering
+ *   of the symbolic alphabet, the radix sorting method produces a lexicographic
+ *   ordering of those keys.
+ * - BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
+ *   <tt>unsigned char</tt>, \p int, \p double, etc.  Within each key, the implementation treats fixed-length
+ *   bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
+ *   method can only be applied to unsigned integral types, BlockRadixSort
+ *   is able to sort signed and floating-point types via simple bit-wise transformations
+ *   that ensure lexicographic key ordering.
+ * - \rowmajor
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockRadixSort}
+ * \par
+ * The code snippet below illustrates a sort of 512 integer keys that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+ *
+ *     // Allocate shared memory for BlockRadixSort
+ *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_keys[4];
+ *     ...
+ *
+ *     // Collectively sort the keys
+ *     BlockRadixSort(temp_storage).Sort(thread_keys);
+ *
+ *     ...
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_keys across the block of threads is
+ * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+ * corresponding output \p thread_keys in those threads will be
+ * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+ *
+ */
+template <
+    typename                KeyT,
+    int                     BLOCK_DIM_X,
+    int                     ITEMS_PER_THREAD,
+    typename                ValueT                   = NullType,
+    int                     RADIX_BITS              = 4,
+    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
+    int                     BLOCK_DIM_Y             = 1,
+    int                     BLOCK_DIM_Z             = 1,
+    int                     PTX_ARCH                = CUB_PTX_ARCH>
+class BlockRadixSort
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        // The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        // Whether this is a keys-only sort, i.e. ValueT is NullType and no values accompany the keys
+        KEYS_ONLY                   = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // Traits of KeyT, and the unsigned bit-pattern type that the keys are ranked and twiddled as
+    typedef Traits<KeyT>                        KeyTraits;
+    typedef typename KeyTraits::UnsignedBits    UnsignedBits;
+
+    /// BlockRadixRank specialization used to rank keys for ascending sorts (third template argument \p false)
+    typedef BlockRadixRank<
+            BLOCK_DIM_X,
+            RADIX_BITS,
+            false,
+            MEMOIZE_OUTER_SCAN,
+            INNER_SCAN_ALGORITHM,
+            SMEM_CONFIG,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        AscendingBlockRadixRank;
+
+    /// BlockRadixRank specialization used to rank keys for descending sorts (third template argument \p true)
+    typedef BlockRadixRank<
+            BLOCK_DIM_X,
+            RADIX_BITS,
+            true,
+            MEMOIZE_OUTER_SCAN,
+            INNER_SCAN_ALGORITHM,
+            SMEM_CONFIG,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        DescendingBlockRadixRank;
+
+    /// BlockExchange utility type used to scatter keys by rank
+    typedef BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeKeys;
+
+    /// BlockExchange utility type used to scatter values by rank
+    typedef BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeValues;
+
+    /// Shared memory storage layout type; a union, so the ranking and exchange storages occupy the same memory
+    union _TempStorage
+    {
+        typename AscendingBlockRadixRank::TempStorage  asending_ranking_storage;       // (sic) the identifier is spelled "asending"
+        typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
+        typename BlockExchangeKeys::TempStorage        exchange_keys;
+        typename BlockExchangeValues::TempStorage      exchange_values;
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Reference to the temporary storage this instance operates on
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Provides a statically-declared __shared__ _TempStorage instance
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage block_private_storage;
+        return block_private_storage;
+    }
+
+    /// Ranks keys for an ascending sort; this overload is selected by the Int2Type<false> tag
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        int             begin_bit,
+        int             pass_bits,
+        Int2Type<false> /*is_descending*/)
+    {
+        // Construct the ascending ranker over its member of the shared storage union
+        AscendingBlockRadixRank ranker(temp_storage.asending_ranking_storage);
+
+        // Rank on the digit of pass_bits bits that starts at begin_bit
+        ranker.RankKeys(unsigned_keys, ranks, begin_bit, pass_bits);
+    }
+
+    /// Ranks keys for a descending sort; this overload is selected by the Int2Type<true> tag
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        int             begin_bit,
+        int             pass_bits,
+        Int2Type<true>  /*is_descending*/)
+    {
+        // Construct the descending ranker over its member of the shared storage union
+        DescendingBlockRadixRank ranker(temp_storage.descending_ranking_storage);
+
+        // Rank on the digit of pass_bits bits that starts at begin_bit
+        ranker.RankKeys(unsigned_keys, ranks, begin_bit, pass_bits);
+    }
+
+    /// Scatters values by rank into a blocked arrangement (key-value sort overload)
+    __device__ __forceinline__ void ExchangeValues(
+        ValueT          (&values)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        Int2Type<false> /*is_keys_only*/,
+        Int2Type<true>  /*is_blocked*/)
+    {
+        // Block-wide barrier ahead of the value exchange
+        CTA_SYNC();
+        BlockExchangeValues value_exchange(temp_storage.exchange_values);
+        value_exchange.ScatterToBlocked(values, ranks);
+    }
+
+    /// Scatters values by rank into a striped arrangement (key-value sort overload)
+    __device__ __forceinline__ void ExchangeValues(
+        ValueT          (&values)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        Int2Type<false> /*is_keys_only*/,
+        Int2Type<false> /*is_blocked*/)
+    {
+        // Block-wide barrier ahead of the value exchange
+        CTA_SYNC();
+        BlockExchangeValues value_exchange(temp_storage.exchange_values);
+        value_exchange.ScatterToStriped(values, ranks);
+    }
+
+    /// ExchangeValues (specialized for keys-only sort): the body is empty, so nothing is exchanged and no barrier is issued
+    template <int IS_BLOCKED>
+    __device__ __forceinline__ void ExchangeValues(
+        ValueT                  (&/*values*/)[ITEMS_PER_THREAD],
+        int                     (&/*ranks*/)[ITEMS_PER_THREAD],
+        Int2Type<true>          /*is_keys_only*/,
+        Int2Type<IS_BLOCKED>    /*is_blocked*/)
+    {}
+
+    /// Sorts keys (and values, unless keys-only) held in a blocked arrangement, advancing RADIX_BITS bits per pass
+    template <int DESCENDING, int KEYS_ONLY>
+    __device__ __forceinline__ void SortBlocked(
+        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
+        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
+        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
+        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
+        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
+    {
+        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
+            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);     // View the caller's keys in place as UnsignedBits
+
+        // Pass every key through KeyTraits::TwiddleIn before sorting (undone by TwiddleOut at the end)
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
+        }
+
+        // Radix sorting passes: always runs at least once; the exit test is after the exchanges
+        while (true)
+        {
+            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);      // The last pass may cover fewer than RADIX_BITS bits
+
+            // Rank the blocked keys
+            int ranks[ITEMS_PER_THREAD];
+            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
+            begin_bit += RADIX_BITS;
+            // Barrier between ranking and the key exchange (their storages are members of the same union)
+            CTA_SYNC();
+
+            // Exchange keys through shared memory in blocked arrangement
+            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
+
+            // Exchange values through shared memory in blocked arrangement
+            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
+
+            // Quit if done
+            if (begin_bit >= end_bit) break;
+            // Barrier before the next pass ranks again
+            CTA_SYNC();
+        }
+
+        // Restore the keys' original representation via KeyTraits::TwiddleOut
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
+        }
+    }
+
+public:
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Sort blocked -> striped arrangement
+    template <int DESCENDING, int KEYS_ONLY>
+    __device__ __forceinline__ void SortBlockedToStriped(
+        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
+        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
+        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
+        Int2Type<DESCENDING>    is_descending,                      ///< Tag indicating whether this is a descending-order sort
+        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag indicating whether this is a keys-only sort
+    {
+        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
+            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);     // In-place view of the keys as UnsignedBits
+
+        // Apply KeyTraits::TwiddleIn to every key before ranking (TwiddleOut is applied after the last pass)
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
+        }
+
+        // Radix sorting passes: each pass covers at most RADIX_BITS bits, starting at begin_bit
+        while (true)
+        {
+            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
+
+            // Rank the blocked keys
+            int ranks[ITEMS_PER_THREAD];
+            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
+            begin_bit += RADIX_BITS;    // Advance to the first bit of the next pass
+
+            CTA_SYNC();     // NOTE(review): presumably orders ranking before the exchange reuses temp_storage -- confirm
+
+            // Check if this is the last pass (the advanced begin_bit has reached end_bit)
+            if (begin_bit >= end_bit)
+            {
+                // Last pass exchanges keys through shared memory in striped arrangement
+                BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
+
+                // Last pass exchanges values through shared memory in striped arrangement (no-op overload when keys-only)
+                ExchangeValues(values, ranks, is_keys_only, Int2Type<false>());
+
+                // Quit
+                break;
+            }
+
+            // Exchange keys through shared memory in blocked arrangement
+            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
+
+            // Exchange values through shared memory in blocked arrangement (no-op overload when keys-only)
+            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
+
+            CTA_SYNC();     // Reached only when another pass follows
+        }
+
+        // Apply KeyTraits::TwiddleOut to every key (counterpart of the TwiddleIn above)
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
+        }
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    /// \smemstorage{BlockRadixSort}
+    struct TempStorage : Uninitialized<_TempStorage> {};    // Alias wrapper allowing storage to be unioned
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixSort()
+    :
+        temp_storage(PrivateStorage()),     // Bind to the storage returned by PrivateStorage()
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))  // NOTE(review): presumably the caller's row-major linear thread index -- confirm
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixSort(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),    // Bind to the caller-provided allocation through its Alias() view
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))  // NOTE(review): presumably the caller's row-major linear thread index -- confirm
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Sorting (blocked arrangements)
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Performs an ascending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).Sort(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
+     * The corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     */
+    __device__ __forceinline__ void Sort(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];          // Placeholder array; only needed to call the shared key-value entry point
+
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());    // Int2Type<false>: ascending order
+    }
+
+
+    /**
+     * \brief Performs an ascending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void Sort(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());    // Int2Type<false>: ascending order
+    }
+
+    /**
+     * \brief Performs a descending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).SortDescending(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
+     * The corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }</tt>.
+     */
+    __device__ __forceinline__ void SortDescending(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];          // Placeholder array; only needed to call the shared key-value entry point
+
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());     // Int2Type<true>: descending order
+    }
+
+
+    /**
+     * \brief Performs a descending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).SortDescending(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,510,509,508], [507,506,505,504], [503,502,501,500], ..., [3,2,1,0] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortDescending(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());     // Int2Type<true>: descending order
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Sorting (blocked arrangement -> striped arrangement)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];          // Placeholder array; only needed to call the shared key-value entry point
+
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());   // Int2Type<false>: ascending order
+    }
+
+
+    /**
+     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());   // Int2Type<false>: ascending order
+    }
+
+
+    /**
+     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     ...
+     *
+     *     // Collectively sort the keys
+     *     BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortDescendingBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        NullType values[ITEMS_PER_THREAD];          // Placeholder array; only needed to call the shared key-value entry point
+
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());    // Int2Type<true>: descending order
+    }
+
+
+    /**
+     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par
+     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
+     *   more than one tile of values, simply perform a key-value sort of the keys paired
+     *   with a temporary value array that enumerates the key indices.  The reordered indices
+     *   can then be used as a gather-vector for exchanging other associated tile data through
+     *   shared memory.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sort of 512 integer keys and values that
+     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
+     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
+     *
+     *     // Allocate shared memory for BlockRadixSort
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_keys[4];
+     *     int thread_values[4];
+     *     ...
+     *
+     *     // Collectively sort the keys and values among block threads
+     *     BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys, thread_values);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_keys across the block of threads is
+     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
+     * corresponding output \p thread_keys in those threads will be
+     * <tt>{ [511,383,255,127], [510,382,254,126], [509,381,253,125], ..., [384,256,128,0] }</tt>.
+     *
+     */
+    __device__ __forceinline__ void SortDescendingBlockedToStriped(
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
+    {
+        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());    // Int2Type<true>: descending order
+    }
+
+
+    //@}  end member group
+
+};
+
+/**
+ * \example example_block_radix_sort.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/block_raking_layout.cuh b/external/cub/cub/block/block_raking_layout.cuh
new file mode 100644
index 00000000000..ab6b71036cd
--- /dev/null
+++ b/external/cub/cub/block/block_raking_layout.cuh
@@ -0,0 +1,152 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
+ */
+
+
+#pragma once
+
+#include "../util_macro.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.    ![](raking.png)
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * This type facilitates a shared memory usage pattern where a block of CUDA
+ * threads places elements into shared memory and then reduces the active
+ * parallelism to one "raking" warp of threads for serially aggregating consecutive
+ * sequences of shared items.  Padding is inserted to eliminate bank conflicts
+ * (for most data types).
+ *
+ * \tparam T                        The data type to be exchanged.
+ * \tparam BLOCK_THREADS            The thread block size in threads.
+ * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
+ */
+template <
+    typename    T,
+    int         BLOCK_THREADS,
+    int         PTX_ARCH = CUB_PTX_ARCH>
+struct BlockRakingLayout
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// The total number of elements that need to be cooperatively reduced (one per thread)
+        SHARED_ELEMENTS = BLOCK_THREADS,
+
+        /// Maximum number of warp-synchronous raking threads (at most one warp's worth)
+        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Number of raking elements per warp-synchronous raking thread (rounded up)
+        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
+
+        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
+        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
+
+        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
+        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
+
+        /// Degree of bank conflicts (e.g., 4-way); 1 when HAS_CONFLICTS is false
+        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
+            (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
+            1,
+
+        /// Pad each segment with one element when the segment length is even and greater than 2 (i.e., not relatively prime to warp size and can't be optimized as a vector load)
+        USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),
+
+        /// Total number of elements in the raking grid (including any per-segment padding)
+        GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),
+
+        /// Whether raking can proceed without bounds checking (true when the number of reduction elements is a multiple of the number of raking threads)
+        UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
+    };
+
+
+    /**
+     * \brief Shared memory storage type
+     */
+    struct __align__(16) _TempStorage
+    {
+        T buff[BlockRakingLayout::GRID_ELEMENTS];
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /**
+     * \brief Returns the location for the calling thread to place data into the grid
+     */
+    static __device__ __forceinline__ T* PlacementPtr(
+        TempStorage &temp_storage,
+        unsigned int linear_tid)
+    {
+        // Start from the thread's position in the unpadded grid
+        unsigned int offset = linear_tid;
+
+        // Skip one padding element for every complete segment that precedes this position
+        if (USE_SEGMENT_PADDING > 0)
+        {
+            offset += offset / SEGMENT_LENGTH;
+        }
+
+        // Return the thread's (possibly padded) slot within the raking grid buffer
+        return temp_storage.Alias().buff + offset;
+    }
+
+
+    /**
+     * \brief Returns the location for the calling thread to begin sequential raking (the start of its padded segment)
+     */
+    static __device__ __forceinline__ T* RakingPtr(
+        TempStorage &temp_storage,
+        unsigned int linear_tid)
+    {
+        return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
+    }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/block_reduce.cuh b/external/cub/cub/block/block_reduce.cuh
new file mode 100644
index 00000000000..a9de9e71742
--- /dev/null
+++ b/external/cub/cub/block/block_reduce.cuh
@@ -0,0 +1,607 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_reduce_raking.cuh"
+#include "specializations/block_reduce_raking_commutative_only.cuh"
+#include "specializations/block_reduce_warp_reductions.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * BlockReduceAlgorithm enumerates alternative algorithms for parallel
+ * reduction across a CUDA thread block.
+ */
+enum BlockReduceAlgorithm
+{
+
+    /**
+     * \par Overview
+     * An efficient "raking" reduction algorithm that only supports commutative
+     * reduction operators (true for most operations, e.g., addition).
+     *
+     * \par
+     * Execution is comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Threads in warps other than the first warp place
+     *    their partial reductions into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within the first
+     *    warp continue to accumulate by raking across segments of shared partial reductions
+     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
+     *
+     * \par
+     * \image html block_reduce.png
+     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - This variant performs less communication than BLOCK_REDUCE_RAKING
+     *   and is preferable when the reduction operator is commutative.  This variant
+     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
+     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
+     *   higher than that of BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
+     *   when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
+
+
+    /**
+     * \par Overview
+     * An efficient "raking" reduction algorithm that supports commutative
+     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
+     * operators. \blocked.
+     *
+     * \par
+     * Execution is comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within a
+     *    single warp rake across segments of shared partial reductions.
+     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
+     *
+     * \par
+     * \image html block_reduce.png
+     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - This variant performs more communication than BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
+     *   and is only preferable when the reduction operator is non-commutative.  This variant
+     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
+     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
+     *   higher than that of BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
+     *   when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_RAKING,
+
+
+    /**
+     * \par Overview
+     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
+     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
+     * operators.
+     *
+     * \par
+     * Execution is comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
+     *    reduction within each warp.
+     * -# A propagation phase where the warp reduction outputs in each warp are
+     *    updated with the aggregate from each preceding warp.
+     *
+     * \par
+     * \image html block_scan_warpscans.png
+     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
+     *   or BLOCK_REDUCE_RAKING, which may result in lower overall
+     *   throughput across the GPU.  However turn-around latency may be lower and
+     *   thus useful when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_WARP_REDUCTIONS,
+};
+
+
+/******************************************************************************
+ * Block reduce
+ ******************************************************************************/
+
+/**
+ * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                Data type being reduced
+ * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
+ * \tparam ALGORITHM        <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
+ * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ *   uses a binary combining operator to compute a single aggregate from a list of input elements.
+ * - \rowmajor
+ * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
+ *   -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>.  An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
+ *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
+ *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ * - Very efficient (only one synchronization barrier).
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *   - Summation (<b><em>vs.</em></b> generic reduction)
+ *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
+ *   - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
+ * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockReduce}
+ * \par
+ * The code snippet below illustrates a sum reduction of 512 integer items that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+ *     typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ *     // Allocate shared memory for BlockReduce
+ *     __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Compute the block-wide sum for thread0
+ *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+ * }
+ * \endcode
+ *
+ */
+template <
+    typename                T,
+    int                     BLOCK_DIM_X,
+    BlockReduceAlgorithm    ALGORITHM       = BLOCK_REDUCE_WARP_REDUCTIONS,
+    int                     BLOCK_DIM_Y     = 1,
+    int                     BLOCK_DIM_Z     = 1,
+    int                     PTX_ARCH        = CUB_PTX_ARCH>
+class BlockReduce
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>           WarpReductions;
+    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>    RakingCommutativeOnly;
+    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                   Raking;
+
+    /// Internal specialization type
+    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
+        WarpReductions,
+        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
+            RakingCommutativeOnly,
+            Raking>::Type>::Type InternalBlockReduce;     // Raking is the final alternative when neither test above matches
+
+    /// Shared memory storage layout type for BlockReduce
+    typedef typename InternalBlockReduce::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockReduce}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockReduce()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockReduce(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Generic reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+     * }
+     * \endcode
+     *
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               input,                      ///< [in] Calling thread's input
+        ReductionOp     reduction_op)               ///< [in] Binary reduction functor
+    {
+        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+     * }
+     * \endcode
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
+        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor
+    {
+        // Reduce partials
+        T partial = internal::ThreadReduce(inputs, reduction_op);
+        return Reduce(partial, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid) thread_data = ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
+     * }
+     * \endcode
+     *
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,                  ///< [in] Calling thread's input
+        ReductionOp         reduction_op,           ///< [in] Binary reduction functor
+        int                 num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine if we can skip bounds checking (every thread holds a valid item when num_valid >= BLOCK_THREADS)
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
+        }
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     * }
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input)                      ///< [in] Calling thread's input
+    {
+        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
+    }
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     * }
+     * \endcode
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ T Sum(
+        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
+    {
+        // Reduce partials
+        T partial = internal::ThreadReduce(inputs, cub::Sum());
+        return Sum(partial);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread<sub>0</sub>.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item (up to num_valid)
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid)
+     *         thread_data = ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
+     * }
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input,                  ///< [in] Calling thread's input
+        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine if we can skip bounds checking (every thread holds a valid item when num_valid >= BLOCK_THREADS)
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
+        }
+    }
+
+
+    //@}  end member group
+};
+
+/**
+ * \example example_block_reduce.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/block_scan.cuh b/external/cub/cub/block/block_scan.cuh
new file mode 100644
index 00000000000..245084cff61
--- /dev/null
+++ b/external/cub/cub/block/block_scan.cuh
@@ -0,0 +1,2126 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_scan_raking.cuh"
+#include "specializations/block_scan_warp_scans.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_ptx.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
+ */
+enum BlockScanAlgorithm
+{
+
+    /**
+     * \par Overview
+     * An efficient "raking reduce-then-scan" prefix scan algorithm.  Execution is comprised of five phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within a single warp rake across segments of shared partial reductions.
+     * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
+     * -# Downsweep sequential exclusive scan in shared memory.  Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
+     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+     *
+     * \par
+     * \image html block_scan_raking.png
+     * <div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - Although this variant may suffer longer turnaround latencies when the
+     *   GPU is under-occupied, it can often provide higher overall throughput
+     *   across the GPU when suitably occupied.
+     */
+    BLOCK_SCAN_RAKING,          // Default ALGORITHM of cub::BlockScan
+
+
+    /**
+     * \par Overview
+     * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
+     * the expense of higher register pressure.  Raking threads preserve their
+     * "upsweep" segment of values in registers while performing warp-synchronous
+     * scan, allowing the "downsweep" not to re-read them from shared memory.
+     */
+    BLOCK_SCAN_RAKING_MEMOIZE,
+
+
+    /**
+     * \par Overview
+     * A quick "tiled warpscans" prefix scan algorithm.  Execution is comprised of four phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
+     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
+     * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
+     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+     *
+     * \par
+     * \image html block_scan_warpscans.png
+     * <div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - Although this variant may suffer lower overall throughput across the
+     *   GPU due to a heavy reliance on inefficient warpscans, it can
+     *   often provide lower turnaround latencies when the GPU is under-occupied.
+     */
+    BLOCK_SCAN_WARP_SCANS,
+};
+
+
+/******************************************************************************
+ * Block scan
+ ******************************************************************************/
+
+/**
+ * \brief The BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                Data type being scanned
+ * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
+ * \tparam ALGORITHM        <b>[optional]</b> cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING)
+ * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
+ *   produces an output list where each element is computed to be the reduction
+ *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
+ *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
+ *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
+ *   the <em>i</em><sup>th</sup> output reduction.
+ * - \rowmajor
+ * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
+ *   -# <b>cub::BLOCK_SCAN_RAKING</b>.  An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ *   -# <b>cub::BLOCK_SCAN_RAKING_MEMOIZE</b>.  Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
+ *   -# <b>cub::BLOCK_SCAN_WARP_SCANS</b>.  A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ * - Uses special instructions when applicable (e.g., warp \p SHFL)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Invokes a minimal number of block-wide synchronization barriers (only
+ *   one or two depending on algorithm selection)
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *   - Prefix sum variants (<b><em>vs.</em></b> generic scan)
+ *   - \blocksize
+ * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockScan}
+ * \par
+ * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+ * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockScan for a 1D block of 128 threads on type int
+ *     typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Collectively compute the block-wide exclusive prefix sum
+ *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * <tt>{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}</tt>.
+ * The corresponding output \p thread_data in those threads will be
+ * <tt>{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}</tt>.
+ *
+ */
+template <
+    typename            T,
+    int                 BLOCK_DIM_X,
+    BlockScanAlgorithm  ALGORITHM       = BLOCK_SCAN_RAKING,
+    int                 BLOCK_DIM_Y     = 1,
+    int                 BLOCK_DIM_Z     = 1,
+    int                 PTX_ARCH        = CUB_PTX_ARCH>
+class BlockScan
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Compile-time constants
+    enum
+    {
+        /// Total number of threads in the block (product of the X, Y and Z block dimensions)
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /**
+     * Algorithm actually used, after validating the template parameterization.
+     * BLOCK_SCAN_WARP_SCANS cannot be used when BLOCK_THREADS is not a multiple
+     * of the architectural warp size (CUB_WARP_THREADS(PTX_ARCH)); such requests
+     * fall back to BLOCK_SCAN_RAKING.  Any other request is kept unchanged.
+     */
+    static const BlockScanAlgorithm SAFE_ALGORITHM =
+        ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ?
+            BLOCK_SCAN_RAKING :
+            ALGORITHM;
+
+    typedef BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpScans;   // candidate delegate for BLOCK_SCAN_WARP_SCANS
+    typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;   // candidate delegate for both raking variants; the bool flags the MEMOIZE variant
+
+    /// Delegate type implementing the scan: WarpScans when SAFE_ALGORITHM is BLOCK_SCAN_WARP_SCANS, otherwise Raking
+    typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
+        WarpScans,
+        Raking>::Type InternalBlockScan;
+
+    /// Shared memory storage layout type for BlockScan (taken from the selected delegate)
+    typedef typename InternalBlockScan::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Reference to the shared storage (of layout type _TempStorage) bound by the constructor
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id, initialized from RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)
+    unsigned int linear_tid;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator: returns a reference to a function-local __shared__ _TempStorage (used by the default constructor)
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;    // statically declared in shared memory, not a per-thread local
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Public types
+     ******************************************************************************/
+public:
+
+    /// \smemstorage{BlockScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};    // public wrapper around _TempStorage; the constructor reaches the layout through Alias()
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory (obtained from PrivateStorage()) as temporary storage.
+     */
+    __device__ __forceinline__ BlockScan()
+    :
+        temp_storage(PrivateStorage()),                                     // bind to the internally declared __shared__ storage
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))      // cache this thread's linear id within the block
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  The caller supplies the allocation; nothing is allocated here.
+     */
+    __device__ __forceinline__ BlockScan(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),                                 // parameter shadows the member; the member binds to what Alias() returns
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))      // cache this thread's linear id within the block
+    {}
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix sum operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  The value of 0 is applied as the initial value, and is assigned to \p output in <em>thread</em><sub>0</sub>.
+     *
+     * \par
+     * - \identityzero
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
+     *
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
+    {
+        T initial_value = 0;                                        // additive identity used as the scan's initial value
+        ExclusiveScan(input, output, initial_value, cub::Sum());    // prefix sum == generic exclusive scan with the addition functor
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  The value of 0 is applied as the initial value, and is assigned to \p output in <em>thread</em><sub>0</sub>.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \identityzero
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
+     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
+     *
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        T initial_value = 0;                                                        // additive identity used as the scan's initial value
+        ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);   // delegate; block_aggregate is filled in by ExclusiveScan
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  The block-wide aggregate of all inputs is passed to the functor as its \p block_aggregate argument.
+     *
+     * \par
+     * - \identityzero
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items, one item per thread
+     *         int thread_data = d_data[block_offset + threadIdx.x];
+     *
+     *         // Collectively compute the block-wide exclusive prefix sum
+     *         BlockScan(temp_storage).ExclusiveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset + threadIdx.x] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 1, ..., 127</tt>.
+     * The output for the second segment will be <tt>128, 129, ..., 255</tt>.
+     *
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);     // no explicit initial value: the call-back functor supplies the prefix
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix sum operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  The value of 0 is applied as the initial value, and is assigned to \p output[0] in <em>thread</em><sub>0</sub>.
+     *
+     * \par
+     * - \identityzero
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD])  ///< [out] Calling thread's output items (may be aliased to \p input)
+    {
+        T initial_value = 0;                                        // additive identity used as the scan's initial value
+        ExclusiveScan(input, output, initial_value, cub::Sum());    // delegate to the array overload of the generic exclusive scan
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  The value of 0 is applied as the initial value, and is assigned to \p output[0] in <em>thread</em><sub>0</sub>.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \identityzero
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                 (&input)[ITEMS_PER_THREAD],       ///< [in] Calling thread's input items
+        T                 (&output)[ITEMS_PER_THREAD],      ///< [out] Calling thread's output items (may be aliased to \p input)
+        T                 &block_aggregate)                 ///< [out] block-wide aggregate reduction of input items
+    {
+        // Delegate to the generic array ExclusiveScan using addition, seeded with 0
+        T initial_value = 0;
+        ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  The block-wide aggregate of all inputs is passed to the functor as its \p block_aggregate argument.
+     *
+     * \par
+     * - \identityzero
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
+     * across 128 threads where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide exclusive prefix sum
+     *         int block_aggregate;
+     *         BlockScan(temp_storage.scan).ExclusiveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 1, 2, 3, ..., 510, 511</tt>.
+     * The output for the second segment will be <tt>512, 513, 514, 515, ..., 1022, 1023</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                       (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's \p ITEMS_PER_THREAD input items
+        T                       (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's \p ITEMS_PER_THREAD output items (may be aliased to \p input)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)    ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);   // The prefix sum is the callback-seeded exclusive scan with cub::Sum() as the operator
+    }
+
+
+
+    //@}  end member group        // Exclusive prefix sums
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               initial_value,                  ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output in <em>thread</em><sub>0</sub>)
+        ScanOp          scan_op)                        ///< [in] Binary scan functor
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op);   // Forward unchanged to an InternalBlockScan constructed over temp_storage
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
+     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
+        T               initial_value,      ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output in <em>thread</em><sub>0</sub>)
+        ScanOp          scan_op,            ///< [in] Binary scan functor
+        T               &block_aggregate)   ///< [out] Block-wide aggregate reduction of input items
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);   // Forward unchanged to an InternalBlockScan constructed over temp_storage
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  The block-wide aggregate of all inputs is supplied to the functor as its argument; this overload has no \p block_aggregate output parameter.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the block-wide aggregate of all scan inputs (this overload does not return it to the caller).
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data = d_data[block_offset + threadIdx.x];
+     *
+     *         // Collectively compute the block-wide exclusive prefix max scan
+     *         BlockScan(temp_storage).ExclusiveScan(
+     *             thread_data, thread_data, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset + threadIdx.x] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
+     * The output for the second segment will be <tt>126, 128, 128, 130, ..., 252, 254</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op);   // No initial_value argument here: only the callback is forwarded alongside scan_op
+    }
+
+
+    //@}  end member group        // Exclusive prefix scans
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T       (&input)[ITEMS_PER_THREAD],     ///< [in] Items owned by the calling thread
+        T       (&output)[ITEMS_PER_THREAD],    ///< [out] Scanned items for the calling thread (may alias \p input)
+        T       initial_value,                  ///< [in] Seed for the exclusive scan (becomes \p output[0] in <em>thread</em><sub>0</sub>)
+        ScanOp  scan_op)                        ///< [in] Binary scan functor
+    {
+        // Step 1: collapse this thread's items into a single partial using scan_op
+        T thread_seed = internal::ThreadReduce(input, scan_op);
+
+        // Step 2: block-wide exclusive scan over the per-thread partials (in place)
+        ExclusiveScan(thread_seed, thread_seed, initial_value, scan_op);
+
+        // Step 3: exclusive scan over this thread's own items, seeded with its scanned partial
+        internal::ThreadScanExclusive(input, output, scan_op, thread_seed);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
+     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T       (&input)[ITEMS_PER_THREAD],     ///< [in] Items owned by the calling thread
+        T       (&output)[ITEMS_PER_THREAD],    ///< [out] Scanned items for the calling thread (may alias \p input)
+        T       initial_value,                  ///< [in] Seed for the exclusive scan (becomes \p output[0] in <em>thread</em><sub>0</sub>)
+        ScanOp  scan_op,                        ///< [in] Binary scan functor
+        T       &block_aggregate)               ///< [out] Block-wide aggregate reduction of input items
+    {
+        // Step 1: collapse this thread's items into a single partial using scan_op
+        T thread_seed = internal::ThreadReduce(input, scan_op);
+
+        // Step 2: block-wide exclusive scan over the per-thread partials (in place); also fills block_aggregate
+        ExclusiveScan(thread_seed, thread_seed, initial_value, scan_op, block_aggregate);
+
+        // Step 3: exclusive scan over this thread's own items, seeded with its scanned partial
+        internal::ThreadScanExclusive(input, output, scan_op, thread_seed);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  The block-wide aggregate of all inputs is supplied to the functor as its argument; this overload has no \p block_aggregate output parameter.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the block-wide aggregate of all scan inputs (this overload does not return it to the caller).
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix max scan over multiple "tiles" of input using a
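+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads where each thread owns 4 consecutive items.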
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide exclusive prefix max scan
+     *         BlockScan(temp_storage.scan).ExclusiveScan(
+     *             thread_data, thread_data, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510</tt>.
+     * The output for the second segment will be <tt>510, 512, 512, 514, 514, 516, ..., 1020, 1022</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD         <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp                   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp    <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp,
+        typename        BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Items owned by the calling thread
+        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Scanned items for the calling thread (may alias \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor that supplies the block-wide prefix applied to the logical input sequence.
+    {
+        // Step 1: collapse this thread's items into a single partial using scan_op
+        T thread_seed = internal::ThreadReduce(input, scan_op);
+
+        // Step 2: block-wide exclusive scan over the per-thread partials (in place), seeded through the callback
+        ExclusiveScan(thread_seed, thread_seed, scan_op, block_prefix_callback_op);
+
+        // Step 3: exclusive scan over this thread's own items, seeded with its scanned partial
+        internal::ThreadScanExclusive(input, output, scan_op, thread_seed);
+    }
+
+
+    //@}  end member group
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document no-initial-value scans
+
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations (no initial value, single datum per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input); undefined for <em>thread</em><sub>0</sub> because there is no initial value
+        ScanOp          scan_op)                        ///< [in] Binary scan functor
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op);   // Forward unchanged to an InternalBlockScan constructed over temp_storage
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input); undefined for <em>thread</em><sub>0</sub> because there is no initial value
+        ScanOp          scan_op,                        ///< [in] Binary scan functor
+        T               &block_aggregate)               ///< [out] Block-wide aggregate reduction of input items
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);   // Forward unchanged to an InternalBlockScan constructed over temp_storage
+    }
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations (no initial value, multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T       (&input)[ITEMS_PER_THREAD],     ///< [in] Items owned by the calling thread
+        T       (&output)[ITEMS_PER_THREAD],    ///< [out] Scanned items for the calling thread (may alias \p input)
+        ScanOp  scan_op)                        ///< [in] Binary scan functor
+    {
+        // Collapse this thread's items into a single partial using scan_op ...
+        T thread_carry = internal::ThreadReduce(input, scan_op);
+        // ... then run the block-wide exclusive scan over those partials (in place, no initial value)
+        ExclusiveScan(thread_carry, thread_carry, scan_op);
+
+        // Flag is false only when linear_tid == 0; presumably it tells ThreadScanExclusive whether to apply the carry (thread 0 has no valid one) -- TODO confirm
+        const bool has_predecessor = (linear_tid != 0);
+        internal::ThreadScanExclusive(input, output, scan_op, thread_carry, has_predecessor);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T       (&input)[ITEMS_PER_THREAD],     ///< [in] Items owned by the calling thread
+        T       (&output)[ITEMS_PER_THREAD],    ///< [out] Scanned items for the calling thread (may alias \p input)
+        ScanOp  scan_op,                        ///< [in] Binary scan functor
+        T       &block_aggregate)               ///< [out] Block-wide aggregate reduction of input items
+    {
+        // Collapse this thread's items into a single partial using scan_op ...
+        T thread_carry = internal::ThreadReduce(input, scan_op);
+        // ... then run the block-wide exclusive scan over those partials (in place, no initial value); also fills block_aggregate
+        ExclusiveScan(thread_carry, thread_carry, scan_op, block_aggregate);
+
+        // Flag is false only when linear_tid == 0; presumably it tells ThreadScanExclusive whether to apply the carry (thread 0 has no valid one) -- TODO confirm
+        const bool has_predecessor = (linear_tid != 0);
+        internal::ThreadScanExclusive(input, output, scan_op, thread_carry, has_predecessor);
+    }
+
+
+    //@}  end member group
+#endif // DOXYGEN_SHOULD_SKIP_THIS  // Do not document no-initial-value scans
+
+    /******************************************************************//**
+     * \name Inclusive prefix sum operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.
+     *
+     * \par
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
+     *
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
+    {
+        InclusiveScan(input, output, cub::Sum());       // The inclusive sum is InclusiveScan with cub::Sum() as the operator
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
+     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
+     *
+     */
+    __device__ __forceinline__ void InclusiveSum(T input,               ///< [in] Calling thread's input item
+                                                 T &output,             ///< [out] Calling thread's output item (may be aliased to \p input)
+                                                 T &block_aggregate)    ///< [out] block-wide aggregate reduction of input items
+    {
+        cub::Sum sum_op;    // addition functor handed to the generic inclusive scan
+        InclusiveScan(input, output, sum_op, block_aggregate);
+    }
+
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data = d_data[block_offset + threadIdx.x];
+     *
+     *         // Collectively compute the block-wide inclusive prefix sum
+     *         BlockScan(temp_storage).InclusiveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset + threadIdx.x] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>1, 2, ..., 128</tt>.
+     * The output for the second segment will be <tt>129, 130, ..., 256</tt>.
+     *
+     * \tparam BlockPrefixCallbackOp          <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveSum(T input,                                            ///< [in] Calling thread's input item
+                                                 T &output,                                          ///< [out] Calling thread's output item (may be aliased to \p input)
+                                                 BlockPrefixCallbackOp &block_prefix_callback_op)    ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        cub::Sum sum_op;    // addition functor handed to the generic inclusive scan
+        InclusiveScan(input, output, sum_op, block_prefix_callback_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix sum operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void InclusiveSum(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD])    ///< [out] Calling thread's output items (may be aliased to \p input)
+    {
+        // One item per thread: defer to the scalar overload
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveSum(input[0], output[0]);
+            return;
+        }
+
+        // Fold this thread's items into a single partial sum (register-only work)
+        Sum sum_op;
+        T thread_partial = internal::ThreadReduce(input, sum_op);
+
+        // Exclusive block-wide sum across the per-thread partials, written back in place
+        ExclusiveSum(thread_partial, thread_partial);
+
+        // Inclusive scan of this thread's items in registers, seeded by the scanned partial (first thread does not seed)
+        internal::ThreadScanInclusive(input, output, sum_op, thread_partial, (linear_tid != 0));
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be
+     * <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
+     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     *
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void InclusiveSum(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        // One item per thread: defer to the scalar overload
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveSum(input[0], output[0], block_aggregate);
+            return;
+        }
+
+        // Fold this thread's items into a single partial sum (register-only work)
+        Sum sum_op;
+        T thread_partial = internal::ThreadReduce(input, sum_op);
+
+        // Exclusive block-wide sum across the per-thread partials, written back in place; also fills block_aggregate
+        ExclusiveSum(thread_partial, thread_partial, block_aggregate);
+
+        // Inclusive scan of this thread's items in registers, seeded by the scanned partial (first thread does not seed)
+        internal::ThreadScanInclusive(input, output, sum_op, thread_partial, (linear_tid != 0));
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
+     * across 128 threads where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide inclusive prefix sum
+     *         BlockScan(temp_storage.scan).InclusiveSum(
+     *             thread_data, thread_data, prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
+     * The corresponding output for the first segment will be <tt>1, 2, 3, 4, ..., 511, 512</tt>.
+     * The output for the second segment will be <tt>513, 514, 515, 516, ..., 1023, 1024</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveSum(
+        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        // One item per thread: defer to the scalar overload
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveSum(input[0], output[0], block_prefix_callback_op);
+            return;
+        }
+
+        // Fold this thread's items into a single partial sum (register-only work)
+        Sum sum_op;
+        T thread_partial = internal::ThreadReduce(input, sum_op);
+
+        // Exclusive block-wide sum across the per-thread partials, prefixed via the call-back functor
+        ExclusiveSum(thread_partial, thread_partial, block_prefix_callback_op);
+
+        // Inclusive scan of this thread's items in registers; every thread seeds with its scanned partial
+        internal::ThreadScanInclusive(input, output, sum_op, thread_partial);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix scan operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(T input,          ///< [in] Calling thread's input item
+                                                  T &output,        ///< [out] Calling thread's output item (may be aliased to \p input)
+                                                  ScanOp scan_op)   ///< [in] Binary scan functor
+    {
+        InternalBlockScan block_scan_impl(temp_storage);    // algorithm-specific scan bound to this instance's temp storage
+        block_scan_impl.InclusiveScan(input, output, scan_op);
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
+     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(T input,              ///< [in] Calling thread's input item
+                                                  T &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
+                                                  ScanOp scan_op,       ///< [in] Binary scan functor
+                                                  T &block_aggregate)   ///< [out] block-wide aggregate reduction of input items
+    {
+        InternalBlockScan block_scan_impl(temp_storage);    // algorithm-specific scan bound to this instance's temp storage
+        block_scan_impl.InclusiveScan(input, output, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data = d_data[block_offset + threadIdx.x];
+     *
+     *         // Collectively compute the block-wide inclusive prefix max scan
+     *         BlockScan(temp_storage).InclusiveScan(
+     *             thread_data, thread_data, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset + threadIdx.x] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
+     * The output for the second segment will be <tt>128, 128, 130, 130, ..., 254, 254</tt>.
+     *
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(T input,                                           ///< [in] Calling thread's input item
+                                                  T &output,                                         ///< [out] Calling thread's output item (may be aliased to \p input)
+                                                  ScanOp scan_op,                                    ///< [in] Binary scan functor
+                                                  BlockPrefixCallbackOp &block_prefix_callback_op)   ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        InternalBlockScan block_scan_impl(temp_storage);    // algorithm-specific scan bound to this instance's temp storage
+        block_scan_impl.InclusiveScan(input, output, scan_op, block_prefix_callback_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix scan operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
+     * corresponding output \p thread_data in those threads will be <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan functor
+    {
+        // One item per thread: defer to the scalar overload
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveScan(input[0], output[0], scan_op);
+            return;
+        }
+
+        // Fold this thread's items into a single partial (register-only work)
+        T thread_partial = internal::ThreadReduce(input, scan_op);
+
+        // Exclusive block-wide scan across the per-thread partials, written back in place
+        ExclusiveScan(thread_partial, thread_partial, scan_op);
+
+        // Inclusive scan of this thread's items in registers, seeded by the scanned partial (first thread does not seed)
+        internal::ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
+     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
+     * The corresponding output \p thread_data in those threads will be
+     * <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
+     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan functor
+        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        if (ITEMS_PER_THREAD != 1)
+        {
+            // Fold this thread's consecutive items into a single partial
+            T seed = internal::ThreadReduce(input, scan_op);
+            // Block-wide exclusive scan over the partials (no initial value); also yields block_aggregate
+            ExclusiveScan(seed, seed, scan_op, block_aggregate);
+
+            // Inclusive scan of the thread's own items in registers (first thread does not seed)
+            const bool apply_seed = (linear_tid != 0);
+            internal::ThreadScanInclusive(input, output, scan_op, seed, apply_seed);
+        }
+        else
+        {
+            InclusiveScan(input[0], output[0], scan_op, block_aggregate);   // lone item per thread: forward it directly
+        }
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  The block-wide aggregate of all inputs is passed to the functor as its \p block_aggregate argument.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 512 integer items that are partitioned in a blocked arrangement across 128 threads, where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage     load;
+     *         typename BlockScan::TempStorage     scan;
+     *         typename BlockStore::TempStorage    store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *
+     *         // Collectively compute the block-wide inclusive prefix max scan
+     *         BlockScan(temp_storage.scan).InclusiveScan(
+     *             thread_data, thread_data, cub::Max(), prefix_op);
+     *         CTA_SYNC();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         CTA_SYNC();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
+     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, 4, 4, ..., 510, 510</tt>.
+     * The output for the second segment will be <tt>512, 512, 514, 514, 516, 516, ..., 1022, 1022</tt>.
+     *
+     * \tparam ITEMS_PER_THREAD         <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp                   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam BlockPrefixCallbackOp    <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        ScanOp,
+        typename        BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan functor
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+    {
+        if (ITEMS_PER_THREAD != 1)
+        {
+            // Fold this thread's consecutive items into a single partial
+            T seed = internal::ThreadReduce(input, scan_op);
+
+            // Block-wide exclusive scan over the partials, with the callback supplying the block prefix
+            ExclusiveScan(seed, seed, scan_op, block_prefix_callback_op);
+
+            // Inclusive scan of the thread's own items in registers, seeded by the scanned partial
+            internal::ThreadScanInclusive(input, output, scan_op, seed);
+        }
+        else
+        {
+            InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);  // lone item per thread: forward it directly
+        }
+    }
+
+    //@}  end member group
+
+
+};
+
+/**
+ * \example example_block_scan.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/block_shuffle.cuh b/external/cub/cub/block/block_shuffle.cuh
new file mode 100644
index 00000000000..504f00e3552
--- /dev/null
+++ b/external/cub/cub/block/block_shuffle.cuh
@@ -0,0 +1,305 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockShuffle class provides [<em>collective</em>](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../util_arch.cuh"
+#include "../util_ptx.cuh"
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief The BlockShuffle class provides [<em>collective</em>](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block.
+ * \ingroup BlockModule
+ *
+ * \tparam T                    The data type to be exchanged.
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * It is commonplace for blocks of threads to rearrange data items between
+ * threads.  The BlockShuffle abstraction allows threads to efficiently shift items
+ * either (a) up to their successor or (b) down to their predecessor.
+ *
+ */
+template <
+    typename            T,
+    int                 BLOCK_DIM_X,
+    int                 BLOCK_DIM_Y         = 1,
+    int                 BLOCK_DIM_Z         = 1,
+    int                 PTX_ARCH            = CUB_PTX_ARCH>
+class BlockShuffle
+{
+private:
+
+    /******************************************************************************
+     * Constants
+     ******************************************************************************/
+
+    enum
+    {
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type: one exchange slot per thread in \p prev (\p next is not used by the methods of this class)
+    struct _TempStorage
+    {
+        T prev[BLOCK_THREADS];
+        T next[BLOCK_THREADS];
+    };
+
+
+public:
+
+    /// \smemstorage{BlockShuffle}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+private:
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+public:
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockShuffle()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockShuffle(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Shuffle movement
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>. The offset \p distance may be negative.
+     *
+     * \par
+     * - \smemreuse
+     */
+    __device__ __forceinline__ void Offset(
+        T   input,                  ///< [in] The input item from the calling thread (<em>thread<sub>i</sub></em>)
+        T&  output,                 ///< [out] The \p input item from the successor (or predecessor) thread <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub> (may be aliased to \p input).  This value is only updated for <em>thread<sub>i</sub></em> when 0 <= (<em>i</em> + \p distance) < <tt>BLOCK_THREADS</tt>
+        int distance = 1)           ///< [in] Offset distance (may be negative)
+    {
+        temp_storage.prev[linear_tid] = input;      // publish this thread's item in its own slot
+
+        CTA_SYNC();
+        const int src_tid = static_cast<int>(linear_tid) + distance;    // signed arithmetic: distance may be negative
+        if ((src_tid >= 0) && (src_tid < BLOCK_THREADS))
+            output = temp_storage.prev[src_tid];
+    }
+
+
+    /**
+     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub>(<em>i</em>+<tt>distance</tt>)%<tt>BLOCK_THREADS</tt></sub>, i.e., the exchange wraps around the end of the block.
+     *
+     * \par
+     * - \smemreuse
+     */
+    __device__ __forceinline__ void Rotate(
+        T   input,                  ///< [in] The calling thread's input item
+        T&  output,                 ///< [out] The \p input item from thread <em>thread</em><sub>(<em>i</em>+<tt>distance</tt>)%<tt>BLOCK_THREADS</tt></sub> (may be aliased to \p input).  This value is updated for every thread.
+        unsigned int distance = 1)  ///< [in] Offset distance (0 < \p distance < <tt>BLOCK_THREADS</tt>); larger values are not wrapped correctly by the single subtraction below
+    {
+        temp_storage.prev[linear_tid] = input;      // publish this thread's item in its own slot
+
+        CTA_SYNC();
+
+        unsigned int offset = linear_tid + distance;    // linear_tid (not threadIdx.x) so multi-dimensional blocks index correctly
+        if (offset >= BLOCK_THREADS)
+            offset -= BLOCK_THREADS;
+
+        output = temp_storage.prev[offset];
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Up(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
+    {
+        temp_storage.prev[linear_tid] = input[ITEMS_PER_THREAD - 1];    // last item is the one handed to the next thread
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
+            prev[ITEM] = input[ITEM - 1];           // descending order keeps this correct when prev aliases input
+
+        // thread 0 has no predecessor thread, so its prev[0] is left untouched
+        if (linear_tid > 0)
+            prev[0] = temp_storage.prev[linear_tid - 1];
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item.  All threads receive the item \p input[ITEMS_PER_THREAD-1] provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Up(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
+        T &block_suffix)                ///< [out] The item \p input[ITEMS_PER_THREAD-1] from <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>, provided to all threads
+    {
+        Up(input, prev);
+        block_suffix = temp_storage.prev[BLOCK_THREADS - 1];    // slots were published before the CTA_SYNC inside Up()
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it down by one item
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Down(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding successor items (may be aliased to \p input).  The item \p prev[ITEMS_PER_THREAD-1] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
+    {
+        temp_storage.prev[linear_tid] = input[0];   // first item is the one handed to the previous thread
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ++ITEM)
+            prev[ITEM] = input[ITEM + 1];           // ascending order keeps this correct when prev aliases input
+
+        if (linear_tid < BLOCK_THREADS - 1)         // the last thread has no successor thread
+            prev[ITEMS_PER_THREAD - 1] = temp_storage.prev[linear_tid + 1];
+    }
+
+
+    /**
+     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of input items, shifting it down by one item.  All threads receive \p input[0] provided by <em>thread</em><sub><tt>0</tt></sub>.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Down(
+        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding successor items (may be aliased to \p input).  The item \p prev[ITEMS_PER_THREAD-1] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
+        T &block_prefix)                ///< [out] The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
+    {
+        Down(input, prev);
+        block_prefix = temp_storage.prev[0];        // slots were published before the CTA_SYNC inside Down()
+    }
+
+    //@}  end member group
+
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/block_store.cuh b/external/cub/cub/block/block_store.cuh
new file mode 100644
index 00000000000..63039afa8e5
--- /dev/null
+++ b/external/cub/cub/block/block_store.cuh
@@ -0,0 +1,1000 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Operations for writing linear segments of data from the CUDA thread block
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_exchange.cuh"
+#include "../util_ptx.cuh"
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+
+/******************************************************************//**
+ * \name Blocked arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectBlocked(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    // Each thread owns ITEMS_PER_THREAD consecutive output slots, starting at this offset
+    const int thread_base = linear_tid * ITEMS_PER_THREAD;
+    OutputIteratorT dst = block_itr + thread_base;
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        dst[i] = items[i];
+    }
+}
+
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range
+ *
+ * \blocked
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectBlocked(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int                 valid_items)                ///< [in] Number of valid items to write
+{
+    // Each thread owns ITEMS_PER_THREAD consecutive output slots; only slots below valid_items are written
+    const int thread_base = linear_tid * ITEMS_PER_THREAD;
+    OutputIteratorT dst = block_itr + thread_base;
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        if (i + thread_base < valid_items)
+        {
+            dst[i] = items[i];
+        }
+    }
+}
+
+
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
+ *
+ * \blocked
+ *
+ * The output pointer \p block_ptr must be quad-item aligned,
+ * which is the default starting offset returned by \p cudaMalloc()
+ *
+ * \par
+ * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
+ *   - \p ITEMS_PER_THREAD is odd
+ *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ *
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD>
+__device__ __forceinline__ void StoreDirectBlockedVectorized(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    T                   *block_ptr,                 ///< [in] Output pointer for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    enum
+    {
+        // Widest candidate: CUDA vector types hold at most 4 elements
+        VEC_CAP = CUB_MIN(4, ITEMS_PER_THREAD),
+
+        // Keep the candidate only if it is a power of two that evenly divides the items per thread; otherwise use width 1
+        VEC_WIDTH = (((VEC_CAP & (VEC_CAP - 1)) != 0) || ((ITEMS_PER_THREAD % VEC_CAP) != 0)) ?
+            1 :
+            VEC_CAP,
+
+        VEC_COUNT = ITEMS_PER_THREAD / VEC_WIDTH,
+    };
+
+    // Vector type holding VEC_WIDTH elements of T
+    typedef typename CubVector<T, VEC_WIDTH>::Type VecT;
+
+    // View the destination as a sequence of vectors
+    VecT *dst_vectors = reinterpret_cast<VecT*>(block_ptr);
+
+    // Staging vectors and an item-wise view of them (a "raw" array that should get optimized away to prevent conservative PTXAS lmem spilling)
+    VecT staged[VEC_COUNT];
+    T *staged_items = reinterpret_cast<T*>(staged);
+
+    // Pack the thread's items into the staging vectors
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        staged_items[i] = items[i];
+    }
+
+    // Blocked store of whole vectors
+    StoreDirectBlocked(linear_tid, dst_vectors, staged);
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    int                 BLOCK_THREADS,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    // Item i of this thread lands at output offset (i * BLOCK_THREADS) + linear_tid
+    OutputIteratorT dst = block_itr + linear_tid;
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        const int stripe_offset = i * BLOCK_THREADS;
+        dst[stripe_offset] = items[i];
+    }
+}
+
+
+/**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    int                 BLOCK_THREADS,
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int                 valid_items)                ///< [in] Number of valid items to write
+{
+    // Item i of this thread lands at output offset (i * BLOCK_THREADS) + linear_tid; only offsets below valid_items are written
+    OutputIteratorT dst = block_itr + linear_tid;
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        const int stripe_offset = i * BLOCK_THREADS;
+        if (stripe_offset + linear_tid < valid_items)
+        {
+            dst[stripe_offset] = items[i];
+        }
+    }
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Warp-striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items held by each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectWarpStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);              // Low bits of linear_tid (lane within the warp, assuming a power-of-two warp size)
+    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;               // Remaining high bits (warp index; presumably the shift is log2 of the warp size)
+    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;        // Offset of the first item of this warp's segment
+
+    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
+
+    // Store directly in warp-striped order: items[ITEM] goes to offset warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
+    }
+}
+
+
+/**
+ * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    <b>[inferred]</b> The data type to store.
+ * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of items held by each thread.
+ * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
+ */
+template <
+    typename            T,
+    int                 ITEMS_PER_THREAD,
+    typename            OutputIteratorT>
+__device__ __forceinline__ void StoreDirectWarpStriped(
+    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
+    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int                 valid_items)                ///< [in] Number of valid items to write
+{
+    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);              // Low bits of linear_tid (lane within the warp, assuming a power-of-two warp size)
+    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;               // Remaining high bits (warp index; presumably the shift is log2 of the warp size)
+    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;        // Offset of the first item of this warp's segment
+
+    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
+
+    // Store directly in warp-striped order, writing only block offsets below valid_items
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)    // Same offset that thread_itr[...] addresses below
+        {
+            thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
+        }
+    }
+}
+
+
+//@}  end member group
+
+
+/** @} */       // end group UtilIo
+
+
+//-----------------------------------------------------------------------------
+// Generic BlockStore abstraction
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
+ */
+enum BlockStoreAlgorithm
+{
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
+     * directly to memory.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) decreases as the
+     *   access stride between threads increases (i.e., the number of items per thread).
+     */
+    BLOCK_STORE_DIRECT,
+
+    /**
+     * \par Overview
+     *
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written directly
+     * to memory using CUDA's built-in vectorized stores as a coalescing optimization.
+     * For example, <tt>st.global.v4.s32</tt> instructions will be generated
+     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high until the
+     *   access stride between threads (i.e., the number of items per thread) exceeds the
+     *   maximum vector store width (typically 4 items or 64B, whichever is lower).
+     * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
+     *   - \p ITEMS_PER_THREAD is odd
+     *   - The \p OutputIteratorT is not a simple pointer type
+     *   - The block output offset is not quadword-aligned
+     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+     */
+    BLOCK_STORE_VECTORIZE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
+     * transposed and then efficiently written to memory as a [<em>striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly longer latencies and throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_TRANSPOSE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
+     * transposed and then efficiently written to memory as a
+     * [<em>warp-striped arrangement</em>](index.html#sec5sec3).
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly longer latencies and throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_WARP_TRANSPOSE,
+
+    /**
+     * \par Overview
+     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
+     * transposed and then efficiently written to memory as a
+     * [<em>warp-striped arrangement</em>](index.html#sec5sec3).
+     * To reduce the shared memory requirement, only one warp's worth of shared
+     * memory is provisioned and is subsequently time-sliced among warps.
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - Provisions less shared memory temporary storage, but incurs larger
+     *   latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
+     */
+    BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
+
+};
+
+
+/**
+ * \brief The BlockStore class provides [<em>collective</em>](index.html#sec0) data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory.  ![](block_store_logo.png)
+ * \ingroup BlockModule
+ * \ingroup UtilIo
+ *
+ * \tparam T                    The type of data to be written.
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
+ * \tparam ALGORITHM            <b>[optional]</b> cub::BlockStoreAlgorithm tuning policy enumeration.  default: cub::BLOCK_STORE_DIRECT.
+ *                              Warp time-slicing (only one warp's worth of shared memory allocated and time-sliced among block-warps during store-related data transpositions, versus each warp having its own storage) is selected with cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED.
+ * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - The BlockStore class provides a single data movement abstraction that can be specialized
+ *   to implement different cub::BlockStoreAlgorithm strategies.  This facilitates different
+ *   performance policies for different architectures, data types, granularity sizes, etc.
+ * - BlockStore can be optionally specialized by different data movement strategies:
+ *   -# <b>cub::BLOCK_STORE_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
+ *      directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      of data is written directly to memory using CUDA's built-in vectorized stores as a
+ *      coalescing optimization.  [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec3) which is
+ *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
+ *      is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec3) which is
+ *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockStore}
+ * \par
+ * The code snippet below illustrates the storing of a "blocked" arrangement
+ * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+ * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+ * meaning items are locally reordered among threads so that memory references will be
+ * efficiently coalesced using a warp-striped access pattern.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+ *
+ *     // Allocate shared memory for BlockStore
+ *     __shared__ typename BlockStore::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Store items to linear memory
+ *     // (thread_data is the array declared and filled above)
+ *     BlockStore(temp_storage).Store(d_data, thread_data);
+ * }
+ * \endcode
+ * \par
+ * Suppose the set of \p thread_data across the block of threads is
+ * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+ * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+ *
+ */
+template <
+    typename                T,
+    int                     BLOCK_DIM_X,
+    int                     ITEMS_PER_THREAD,
+    BlockStoreAlgorithm     ALGORITHM           = BLOCK_STORE_DIRECT,
+    int                     BLOCK_DIM_Y         = 1,
+    int                     BLOCK_DIM_Z         = 1,
+    int                     PTX_ARCH            = CUB_PTX_ARCH>
+class BlockStore
+{
+private:
+    /******************************************************************************
+     * Constants and typed definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /******************************************************************************
+     * Algorithmic variants
+     ******************************************************************************/
+
+    /// Store helper
+    template <BlockStoreAlgorithm _POLICY, int DUMMY>
+    struct StoreInternal;
+
+
+    /**
+     * BLOCK_STORE_DIRECT specialization of store helper: forwards the blocked items unchanged to StoreDirectBlocked
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
+    {
+        /// Shared memory storage layout type (the constructor of this variant ignores its temp_storage argument)
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor: records the linear thread-id only
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_VECTORIZE specialization of store helper: raw T* outputs go through StoreDirectBlockedVectorized, everything else through StoreDirectBlocked
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
+    {
+        /// Shared memory storage layout type (the constructor of this variant ignores its temp_storage argument)
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor: records the linear thread-id only
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &/*temp_storage*/,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
+        __device__ __forceinline__ void Store(
+            T                   *block_ptr,                 ///< [in] The thread block's base output pointer for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
+        }
+
+        /// Store items into a linear segment of memory, specialized for opaque output iterators (skips vectorization)
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT    block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range (always takes the non-vectorized StoreDirectBlocked path)
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_TRANSPOSE specialization of store helper: exchanges items from blocked to striped order, then stores them with StoreDirectStriped
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
+    {
+        // BlockExchange utility type used to transpose the items from blocked to striped order
+        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type: the BlockExchange storage plus one extra int
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor: binds the aliased temporary storage and records the linear thread-id
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory (note: \p items is reordered in place by the exchange)
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            BlockExchange(temp_storage).BlockedToStriped(items);
+            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range (note: \p items is reordered in place by the exchange)
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            BlockExchange(temp_storage).BlockedToStriped(items);
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();                                     // Barrier (presumably block-wide) between thread 0's write above and the read below
+            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper: exchanges items from blocked to warp-striped order, then stores them with StoreDirectWarpStriped
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type used to transpose the items from blocked to warp-striped order
+        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type: the BlockExchange storage plus one extra int
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor: binds the aliased temporary storage and records the linear thread-id
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory (note: \p items is reordered in place by the exchange)
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
+            T                 (&items)[ITEMS_PER_THREAD])   ///< [in] Data to store
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            StoreDirectWarpStriped(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range (note: \p items is reordered in place by the exchange)
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
+            T                 (&items)[ITEMS_PER_THREAD],   ///< [in] Data to store
+            int               valid_items)                  ///< [in] Number of valid items to write
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();                                     // Barrier (presumably block-wide) between thread 0's write above and the read below
+            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
+        }
+    };
+
+
+    /**
+     * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper: same store sequence as BLOCK_STORE_WARP_TRANSPOSE, with a differently configured BlockExchange
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for the warp-striped transpose; the fourth argument is true here (false in BLOCK_STORE_WARP_TRANSPOSE), presumably BlockExchange's warp-time-slicing flag
+        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type: the BlockExchange storage plus one extra int
+        struct _TempStorage : BlockExchange::TempStorage
+        {
+            /// Temporary storage for partially-full block guard
+            volatile int valid_items;
+        };
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor: binds the aliased temporary storage and records the linear thread-id
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory (note: \p items is reordered in place by the exchange)
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            StoreDirectWarpStriped(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range (note: \p items is reordered in place by the exchange)
+        template <typename OutputIteratorT>
+        __device__ __forceinline__ void Store(
+            OutputIteratorT   block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+            int                 valid_items)                ///< [in] Number of valid items to write
+        {
+            BlockExchange(temp_storage).BlockedToWarpStriped(items);
+            if (linear_tid == 0)
+                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
+            CTA_SYNC();                                     // Barrier (presumably block-wide) between thread 0's write above and the read below
+            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
+        }
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Internal store implementation to use
+    typedef StoreInternal<ALGORITHM, 0> InternalStore;
+
+
+    /// Shared memory storage layout type
+    typedef typename InternalStore::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator: returns a reference to a function-local __shared__ _TempStorage instance
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Thread reference to shared storage
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+public:
+
+
+    /// \smemstorage{BlockStore}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  The linear thread-id is taken from RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z).
+     */
+    __device__ __forceinline__ BlockStore()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  The linear thread-id is taken from RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z).
+     */
+    __device__ __forceinline__ BlockStore(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data movement
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Store items into a linear segment of memory.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     // (thread_data is the array declared and filled above)
+     *     BlockStore(temp_storage).Store(d_data, thread_data);
+     * }
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
+     * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
+     *
+     */
+    template <typename OutputIteratorT>
+    __device__ __forceinline__ void Store(
+        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items);    // Dispatch to the StoreInternal specialization selected by ALGORITHM
+    }
+
+    /**
+     * \brief Store items into a linear segment of memory, guarded by range.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     // (thread_data is the array declared and filled above)
+     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+     * }
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt> and \p valid_items is \p 5.
+     * The output \p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ?, ...</tt>, with
+     * only the first two threads being unmasked to store portions of valid data.
+     *
+     */
+    template <typename OutputIteratorT>
+    __device__ __forceinline__ void Store(
+        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+        int                 valid_items)                ///< [in] Number of valid items to write
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);   // Dispatch to the StoreInternal specialization selected by ALGORITHM
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/specializations/block_histogram_atomic.cuh b/external/cub/cub/block/specializations/block_histogram_atomic.cuh
new file mode 100644
index 00000000000..4599c092568
--- /dev/null
+++ b/external/cub/cub/block/specializations/block_histogram_atomic.cuh
@@ -0,0 +1,82 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+template <int BINS>
+struct BlockHistogramAtomic
+{
+    /// Shared memory storage layout type (empty: this specialization declares no temporary storage members)
+    struct TempStorage {};
+
+
+    /// Constructor (the \p temp_storage argument is accepted but not used)
+    __device__ __forceinline__ BlockHistogramAtomic(
+        TempStorage &temp_storage)
+    {}
+
+
+    /// Composite data onto an existing histogram by atomically adding 1 to the bin selected by each of the calling thread's items
+    template <
+        typename            T,
+        typename            CounterT,     
+        int                 ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT             histogram[BINS])                 ///< [in,out] Reference to shared/device-accessible memory histogram (counts are accumulated into it)
+    {
+        // Update histogram: each item value is used directly as an offset into histogram (no range check against BINS)
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+              atomicAdd(histogram + items[i], 1);
+        }
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/specializations/block_histogram_sort.cuh b/external/cub/cub/block/specializations/block_histogram_sort.cuh
new file mode 100644
index 00000000000..b9ad6fb79c5
--- /dev/null
+++ b/external/cub/cub/block/specializations/block_histogram_sort.cuh
@@ -0,0 +1,226 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../block/block_radix_sort.cuh"
+#include "../../block/block_discontinuity.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+template <
+    typename    T,                  ///< Sample type
+    int         BLOCK_DIM_X,        ///< The thread block length in threads along the X dimension
+    int         ITEMS_PER_THREAD,   ///< The number of samples per thread
+    int         BINS,               ///< The number of bins into which histogram samples may fall
+    int         BLOCK_DIM_Y,        ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,        ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>           ///< The PTX compute capability for which to specialize this collective
+struct BlockHistogramSort
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    // Parameterize BlockRadixSort type for our thread block
+    typedef BlockRadixSort<
+            T,
+            BLOCK_DIM_X,
+            ITEMS_PER_THREAD,
+            NullType,
+            4,
+            (PTX_ARCH >= 350) ? true : false,
+            BLOCK_SCAN_WARP_SCANS,
+            cudaSharedMemBankSizeFourByte,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockRadixSortT;
+
+    // Parameterize BlockDiscontinuity type for our thread block
+    typedef BlockDiscontinuity<
+            T,
+            BLOCK_DIM_X,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockDiscontinuityT;
+
+    /// Shared memory (the sort storage is unioned with the flag storage and run-offset arrays, which are used in a later phase)
+    union _TempStorage
+    {
+        // Storage for sorting bin values
+        typename BlockRadixSortT::TempStorage sort;
+
+        struct
+        {
+            // Storage for detecting discontinuities in the tile of sorted bin values
+            typename BlockDiscontinuityT::TempStorage flag;
+
+            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
+            unsigned int run_begin[BINS];
+            unsigned int run_end[BINS];
+        };
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    unsigned int linear_tid;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockHistogramSort(
+        TempStorage     &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    // Discontinuity functor (records run boundaries in temp_storage as a side effect of being evaluated)
+    struct DiscontinuityOp
+    {
+        // Reference to temp_storage
+        _TempStorage &temp_storage;
+
+        // Constructor
+        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
+            temp_storage(temp_storage)
+        {}
+
+        // Discontinuity predicate: when a != b, b_index is recorded as the begin offset of b's run and the end offset of a's run
+        __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index)
+        {
+            if (a != b)
+            {
+                // Note the begin/end offsets in shared storage
+                temp_storage.run_begin[b] = b_index;
+                temp_storage.run_end[a] = b_index;
+
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+    };
+
+
+    // Composite data onto an existing histogram (sorts the tile, locates each bin's run, and adds each run length to its bin)
+    template <
+        typename            CounterT     >
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        CounterT            histogram[BINS])                 ///< [in,out] Reference to shared/device-accessible memory histogram (counts are accumulated into it)
+    {
+        enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
+
+        // Sort samples (bin values) in blocked arrangement
+        BlockRadixSortT(temp_storage.sort).Sort(items);
+
+        CTA_SYNC();     // sort storage shares a union with flag/run_begin/run_end, which are written next
+
+        // Initialize the shared memory's run_begin and run_end for each bin to TILE_SIZE (a bin never touched afterwards yields run_end - run_begin == 0)
+        int histo_offset = 0;
+
+        #pragma unroll
+        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+        {
+            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
+            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
+        }
+        // Finish up with guarded initialization if necessary
+        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+        {
+            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
+            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
+        }
+
+        CTA_SYNC();     // initialization must be complete before flag_op records run offsets into the same arrays
+
+        int flags[ITEMS_PER_THREAD];    // head flags output by FlagHeads; not otherwise used
+
+        // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
+        DiscontinuityOp flag_op(temp_storage);
+        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
+
+        // Update begin for first item (NOTE(review): presumably FlagHeads does not invoke flag_op for the tile's first item -- confirm)
+        if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
+
+        CTA_SYNC();     // all run offsets must be recorded before they are read below
+
+        // Composite into histogram: each bin's count is the length of its run, run_end - run_begin
+        histo_offset = 0;
+
+        #pragma unroll
+        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+        {
+            int thread_offset = histo_offset + linear_tid;
+            CounterT      count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
+            histogram[thread_offset] += count;
+        }
+
+        // Finish up with guarded composition if necessary
+        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+        {
+            int thread_offset = histo_offset + linear_tid;
+            CounterT      count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
+            histogram[thread_offset] += count;
+        }
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/specializations/block_reduce_raking.cuh b/external/cub/cub/block/specializations/block_reduce_raking.cuh
new file mode 100644
index 00000000000..c2c26651796
--- /dev/null
+++ b/external/cub/cub/block/specializations/block_reduce_raking.cuh
@@ -0,0 +1,222 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "../../block/block_raking_layout.cuh"
+#include "../../warp/warp_reduce.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ *
+ * Supports non-commutative binary reduction operators.  Unlike commutative
+ * reduction operators (e.g., addition), the application of a non-commutative
+ * reduction operator (e.g., string concatenation) across a sequence of inputs must
+ * honor the relative ordering of items and partial reductions when applying the
+ * reduction operator.
+ *
+ * Compared to the implementation of BlockReduceRakingCommutativeOnly (which does not support
+ * non-commutative operators), this implementation requires a few extra
+ * rounds of inter-thread communication.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
+struct BlockReduceRaking
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /// Layout type for padded thread block raking grid
+    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
+
+    ///  WarpReduce utility type
+    typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce;
+
+    /// Constants
+    enum
+    {
+        /// Number of raking threads
+        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
+
+        /// Cooperative work can be entirely warp synchronous (every thread of the block is a raking thread)
+        WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
+
+        /// Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power of two)
+        WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE,
+
+        /// Whether or not accesses into smem are unguarded
+        RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,
+
+    };
+
+
+    /// Shared memory storage layout type
+    union _TempStorage
+    {
+        typename WarpReduce::TempStorage            warp_storage;        ///< Storage for warp-synchronous reduction
+        typename BlockRakingLayout::TempStorage     raking_grid;         ///< Padded thread block raking grid
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    unsigned int linear_tid;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceRaking(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    template <bool IS_FULL_TILE, typename ReductionOp, int ITERATION>
+    __device__ __forceinline__ T RakingReduction(
+        ReductionOp                 reduction_op,       ///< [in] Binary reduction operator
+        T                           *raking_segment,    ///< [in] Pointer to the calling raking thread's segment of the raking grid
+        T                           partial,            ///< [in] Partial reduction accumulated so far by the calling raking thread
+        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<ITERATION>         /*iteration*/)
+    {
+        // Update partial if addend is in range (the range test is compiled out for full tiles with an unguarded layout)
+        if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid))
+        {
+            T addend = raking_segment[ITERATION];
+            partial = reduction_op(partial, addend);
+        }
+        return RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<ITERATION + 1>());
+    }
+
+    template <bool IS_FULL_TILE, typename ReductionOp>
+    __device__ __forceinline__ T RakingReduction(
+        ReductionOp                 /*reduction_op*/,   ///< [in] Binary reduction operator
+        T                           * /*raking_segment*/,
+        T                           partial,            ///< [in] Partial reduction accumulated so far by the calling raking thread
+        int                         /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
+    {
+        return partial;     // terminating overload: selected once ITERATION reaches SEGMENT_LENGTH
+    }
+
+
+
+    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <
+        bool                IS_FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
+            partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE, SEGMENT_LENGTH>(
+                partial,
+                num_valid,
+                reduction_op);
+        }
+        else
+        {
+            // Place partial into shared memory grid.
+            *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
+
+            CTA_SYNC();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid (seeded with the segment's first element, then folding in elements 1..SEGMENT_LENGTH-1 in order)
+                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+                partial = raking_segment[0];
+
+                partial = RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>());
+
+                partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>(
+                    partial,
+                    num_valid,
+                    reduction_op);
+
+            }
+        }
+
+        return partial;
+    }
+
+
+    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <bool IS_FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        cub::Sum reduction_op;
+
+        return Reduce<IS_FULL_TILE>(partial, num_valid, reduction_op);
+    }
+
+
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/external/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
new file mode 100644
index 00000000000..ee2294607e9
--- /dev/null
+++ b/external/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
@@ -0,0 +1,199 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "block_reduce_raking.cuh"
+#include "../../warp/warp_reduce.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.  Does not support block sizes that are not a multiple of the warp size.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
+struct BlockReduceRakingCommutativeOnly
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size, is no larger than one warp, or not all threads have valid values
+    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> FallBack;
+
+    /// Constants
+    enum
+    {
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// Whether or not to use fall-back (block size is not a multiple of the warp size, or does not exceed one warp)
+        USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),
+
+        /// Number of raking threads
+        RAKING_THREADS = WARP_THREADS,
+
+        /// Number of threads actually sharing items with the raking threads
+        SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
+    };
+
+    ///  WarpReduce utility type
+    typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;
+
+    /// Layout type for padded thread block raking grid
+    typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;
+
+    /// Shared memory storage layout type
+    union _TempStorage
+    {
+        struct
+        {
+            typename WarpReduce::TempStorage        warp_storage;        ///< Storage for warp-synchronous reduction
+            typename BlockRakingLayout::TempStorage raking_grid;         ///< Padded thread block raking grid
+        };
+        typename FallBack::TempStorage              fallback_storage;    ///< Storage for the BlockReduceRaking fall-back reduction
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    unsigned int linear_tid;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceRakingCommutativeOnly(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <bool FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        if (USE_FALLBACK || !FULL_TILE)
+        {
+            return FallBack(temp_storage.fallback_storage).template Sum<FULL_TILE>(partial, num_valid);
+        }
+        else
+        {
+            // Place partial into shared memory grid (only threads at or beyond RAKING_THREADS do so)
+            if (linear_tid >= RAKING_THREADS)
+                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
+
+            CTA_SYNC();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid (the raking thread's own partial is passed to ThreadReduce along with its segment)
+                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+                partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial);
+
+                // Warp-wide sum of the raking threads' partials
+                partial = WarpReduce(temp_storage.warp_storage).Sum(partial);
+            }
+        }
+
+        return partial;
+    }
+
+
+    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   partial,            ///< [in] Calling thread's input partial reductions
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        if (USE_FALLBACK || !FULL_TILE)
+        {
+            return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid, reduction_op);
+        }
+        else
+        {
+            // Place partial into shared memory grid (only threads at or beyond RAKING_THREADS do so)
+            if (linear_tid >= RAKING_THREADS)
+                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
+
+            CTA_SYNC();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid (the raking thread's own partial is passed to ThreadReduce along with its segment)
+                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+                partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, reduction_op, partial);
+
+                // Warp-wide reduction of the raking threads' partials
+                partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op);
+            }
+        }
+
+        return partial;
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/external/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
new file mode 100644
index 00000000000..68495b4e77e
--- /dev/null
+++ b/external/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
@@ -0,0 +1,222 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ */
+
+#pragma once
+
+#include "../../warp/warp_reduce.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_arch.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
+struct BlockReduceWarpReductions
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// Number of warps needed to cover the block (rounded up)
+        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        /// The logical warp size for warp reductions
+        LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
+
+        /// Whether or not the logical warp size evenly divides the thread block size
+        EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
+    };
+
+
+    ///  WarpReduce utility type
+    typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce;
+
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpReduce::TempStorage    warp_reduce[WARPS];         ///< Per-warp buffer for the warp-synchronous reduction
+        T                                   warp_aggregates[WARPS];     ///< Shared totals from each warp-synchronous reduction
+        T                                   block_prefix;               ///< Shared prefix for the entire thread block (not referenced by the methods of this struct)
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    unsigned int linear_tid;
+    unsigned int warp_id;
+    unsigned int lane_id;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceWarpReductions(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
+        lane_id(LaneId())
+    {}
+
+    /// Folds warp SUCCESSOR_WARP's shared aggregate into \p warp_aggregate (skipped for a partial tile when that warp starts at or beyond \p num_valid), then recurses to the next warp.
+    template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP>
+    __device__ __forceinline__ T ApplyWarpAggregates(
+        ReductionOp                 reduction_op,       ///< [in] Binary reduction operator
+        T                           warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<SUCCESSOR_WARP>    /*successor_warp*/)
+    {
+        if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid))
+        {
+            T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP];
+            warp_aggregate = reduction_op(warp_aggregate, addend);
+        }
+        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<SUCCESSOR_WARP + 1>());
+    }
+
+    template <bool FULL_TILE, typename ReductionOp>
+    __device__ __forceinline__ T ApplyWarpAggregates(
+        ReductionOp         /*reduction_op*/,   ///< [in] Binary reduction operator
+        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                 /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<WARPS>     /*successor_warp*/)
+    {
+        return warp_aggregate;      // Base case: the recursion stops once SUCCESSOR_WARP reaches WARPS
+    }
+
+
+    /// Returns block-wide aggregate in <em>thread</em><sub>0</sub>.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T ApplyWarpAggregates(
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
+        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
+        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        // Lane 0 of every warp publishes its warp aggregate to shared memory
+        if (lane_id == 0)
+        {
+            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+        }
+
+        CTA_SYNC();
+
+        // Thread 0 folds in the aggregates of warps 1 .. WARPS - 1, in warp order
+        if (linear_tid == 0)
+        {
+            warp_aggregate = ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<1>());
+        }
+
+        return warp_aggregate;
+    }
+
+
+    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <bool FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   input,          ///< [in] Calling thread's input partial reductions
+        int                 num_valid)      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        cub::Sum        reduction_op;
+        unsigned int    warp_offset = warp_id * LOGICAL_WARP_SIZE;
+        unsigned int    warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
+                            LOGICAL_WARP_SIZE :
+                            (warp_offset < static_cast<unsigned int>(num_valid)) ?
+                                num_valid - warp_offset :
+                                0;
+
+        // Warp reduction in every warp (same operator instance is reused for the cross-warp step below)
+        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
+            input,
+            warp_num_valid,
+            reduction_op);
+
+        // Combine the per-warp aggregates (published by each lane 0) into the block-wide result for thread 0
+        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
+    }
+
+
+    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
+    template <
+        bool                FULL_TILE,
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input partial reductions
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        unsigned int    warp_offset = warp_id * LOGICAL_WARP_SIZE;
+        unsigned int    warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
+                            LOGICAL_WARP_SIZE :
+                            (warp_offset < static_cast<unsigned int>(num_valid)) ?
+                                num_valid - warp_offset :
+                                0;
+
+        // Warp reduction in every warp
+        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
+            input,
+            warp_num_valid,
+            reduction_op);
+
+        // Combine the per-warp aggregates (published by each lane 0) into the block-wide result for thread 0
+        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/specializations/block_scan_raking.cuh b/external/cub/cub/block/specializations/block_scan_raking.cuh
new file mode 100644
index 00000000000..2e21324c9ee
--- /dev/null
+++ b/external/cub/cub/block/specializations/block_scan_raking.cuh
@@ -0,0 +1,666 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+
+/**
+ * \file
+ * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../util_ptx.cuh"
+#include "../../util_arch.cuh"
+#include "../../block/block_raking_layout.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../thread/thread_scan.cuh"
+#include "../../warp/warp_scan.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
+ */
+template <
+    typename    T,              ///< Data type being scanned
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    bool        MEMOIZE,        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
+    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
+struct BlockScanRaking
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /// Layout type for padded thread block raking grid
+    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
+
+    /// Constants
+    enum
+    {
+        /// Number of raking threads
+        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
+
+        /// Cooperative work can be entirely warp synchronous
+        WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
+    };
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;
+
+    /// Shared memory storage layout type (exposed to callers through the TempStorage wrapper)
+    struct _TempStorage
+    {
+        typename WarpScan::TempStorage              warp_scan;          ///< Buffer for the warp-synchronous scan performed by the raking threads
+        typename BlockRakingLayout::TempStorage     raking_grid;        ///< Padded thread block raking grid (inputs are placed here, scanned results are read back from here)
+        T                                           block_aggregate;    ///< Block aggregate, written by one raking thread and read by every thread in the aggregate-returning scans
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    T               cached_segment[SEGMENT_LENGTH];
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    /// Unrolled guarded reduction step: folds element ITERATION of \p raking_ptr into \p raking_partial when it is in range, then recurses on ITERATION + 1
+    template <int ITERATION, typename ScanOp>
+    __device__ __forceinline__ T GuardedReduce(
+        T*                  raking_ptr,         ///< [in] Input array
+        ScanOp              scan_op,            ///< [in] Binary reduction operator
+        T                   raking_partial,     ///< [in] Prefix to seed reduction with
+        Int2Type<ITERATION> /*iteration*/)
+    {
+        const bool in_range = (BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS);   // unguarded layouts skip the range check
+        if (in_range)
+        {
+            T element = raking_ptr[ITERATION];
+            raking_partial = scan_op(raking_partial, element);
+        }
+        return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>());
+    }
+
+
+    /// Templated reduction (base case): selected when the iteration tag reaches SEGMENT_LENGTH; returns the accumulated partial unchanged
+    template <typename ScanOp>
+    __device__ __forceinline__ T GuardedReduce(
+        T*                          /*raking_ptr*/,    ///< [in] Input array (unused)
+        ScanOp                      /*scan_op*/,       ///< [in] Binary reduction operator (unused)
+        T                           raking_partial,    ///< [in] Partial accumulated by the preceding iterations
+        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
+    {
+        return raking_partial;
+    }
+
+
+    /// Templated copy: copies element ITERATION from \p in to \p out, then recurses on ITERATION + 1 (compile-time unrolled element-wise copy)
+    template <int ITERATION>
+    __device__ __forceinline__ void CopySegment(
+        T*                  out,            ///< [out] Out array
+        T*                  in,             ///< [in] Input array
+        Int2Type<ITERATION> /*iteration*/)
+    {
+        out[ITERATION] = in[ITERATION];
+        CopySegment(out, in, Int2Type<ITERATION + 1>());    // overload resolution picks the base case once ITERATION + 1 == SEGMENT_LENGTH
+    }
+
+ 
+    /// Templated copy (base case): selected when the iteration tag reaches SEGMENT_LENGTH; copies nothing
+    __device__ __forceinline__ void CopySegment(
+        T*                  /*out*/,            ///< [out] Out array (unused)
+        T*                  /*in*/,             ///< [in] Input array (unused)
+        Int2Type<SEGMENT_LENGTH> /*iteration*/)
+    {}
+
+
+    /// Upsweep phase: copies this thread's raking segment into cached_segment and reduces it with \p scan_op, returning the segment aggregate
+    template <typename ScanOp>
+    __device__ __forceinline__ T Upsweep(
+        ScanOp scan_op)
+    {
+        // Snapshot the segment from the raking grid into the per-thread cache
+        CopySegment(
+            cached_segment,
+            BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid),
+            Int2Type<0>());
+
+        // Seed with element 0, then fold in the remaining elements (subject to the range guard)
+        return GuardedReduce(cached_segment, scan_op, cached_segment[0], Int2Type<1>());
+    }
+
+
+    /// Downsweep phase (exclusive): scans this thread's raking segment seeded with \p raking_partial and stores the result back into the raking grid
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveDownsweep(
+        ScanOp          scan_op,                ///< [in] Binary scan operator
+        T               raking_partial,         ///< [in] Seed (prefix) for this thread's segment
+        bool            apply_prefix = true)    ///< [in] Forwarded to internal::ThreadScanExclusive
+    {
+        // Locate this thread's segment of the raking grid
+        T *const segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // With MEMOIZE off, re-read the segment from the grid instead of reusing the copy cached by Upsweep
+        if (MEMOIZE == false)
+            CopySegment(cached_segment, segment, Int2Type<0>());
+
+        // In-place exclusive scan of the cached segment
+        internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+        // Store the scanned segment back into the raking grid
+        CopySegment(segment, cached_segment, Int2Type<0>());
+    }
+
+
+    /// Downsweep phase (inclusive): scans this thread's raking segment seeded with \p raking_partial and stores the result back into the raking grid
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveDownsweep(
+        ScanOp          scan_op,                ///< [in] Binary scan operator
+        T               raking_partial,         ///< [in] Seed (prefix) for this thread's segment
+        bool            apply_prefix = true)    ///< [in] Forwarded to internal::ThreadScanInclusive
+    {
+        // Locate this thread's segment of the raking grid
+        T *const segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // With MEMOIZE off, re-read the segment from the grid instead of reusing the copy cached by Upsweep
+        if (MEMOIZE == false)
+            CopySegment(cached_segment, segment, Int2Type<0>());
+
+        // In-place inclusive scan of the cached segment
+        internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+        // Store the scanned segment back into the raking grid
+        CopySegment(segment, cached_segment, Int2Type<0>());
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor: binds the aliased shared storage and records the calling thread's row-major linear thread id (cached_segment is not explicitly initialized here)
+    __device__ __forceinline__ BlockScanRaking(
+        TempStorage &temp_storage)      ///< [in] Storage wrapper whose Alias() yields the _TempStorage layout used by this collective
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Exclusive block-wide prefix scan under the binary \p scan_op functor, one input element per thread.  No initial value is supplied, so the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        // When every thread is a raking thread (BLOCK_THREADS == RAKING_THREADS), hand the scan straight to WarpScan
+        if (WARP_SYNCHRONOUS)
+        {
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
+            return;
+        }
+
+        // Each thread deposits its input into its own slot of the shared raking grid
+        T *slot = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+        *slot = input;
+
+        CTA_SYNC();
+
+        // Only the raking threads perform the upsweep / warp scan / downsweep
+        if (linear_tid < RAKING_THREADS)
+        {
+            // Reduce this thread's segment of the grid
+            T segment_total = Upsweep(scan_op);
+
+            // Exclusive scan of the segment totals across the raking threads
+            T segment_prefix;
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(segment_total, segment_prefix, scan_op);
+
+            // Scan within the segment; thread 0 has no prefix to apply
+            ExclusiveDownsweep(scan_op, segment_prefix, (linear_tid != 0));
+        }
+
+        // Barrier before reading back what the raking threads wrote
+        CTA_SYNC();
+
+        // Fetch this thread's scanned value from its slot
+        exclusive_output = *slot;
+    }
+
+    /// Exclusive block-wide prefix scan under the binary \p scan_op functor, seeded with \p initial_value.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        // When every thread is a raking thread (BLOCK_THREADS == RAKING_THREADS), hand the scan straight to WarpScan
+        if (WARP_SYNCHRONOUS)
+        {
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
+            return;
+        }
+
+        // Each thread deposits its input into its own slot of the shared raking grid
+        T *slot = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+        *slot = input;
+
+        CTA_SYNC();
+
+        // Only the raking threads perform the upsweep / warp scan / downsweep
+        if (linear_tid < RAKING_THREADS)
+        {
+            // Reduce this thread's segment of the grid
+            T segment_total = Upsweep(scan_op);
+
+            // Seeded exclusive scan of the segment totals across the raking threads
+            T segment_prefix;
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(segment_total, segment_prefix, initial_value, scan_op);
+
+            // Scan within the segment, applying the prefix in every raking thread
+            ExclusiveDownsweep(scan_op, segment_prefix);
+        }
+
+        // Barrier before reading back what the raking threads wrote
+        CTA_SYNC();
+
+        // Fetch this thread's scanned value from its slot
+        output = *slot;
+    }
+
+
+    /// Exclusive block-wide prefix scan under the binary \p scan_op functor, one input element per thread, that also hands every thread the block-wide \p block_aggregate of all inputs.  No initial value is supplied, so the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // When every thread is a raking thread (BLOCK_THREADS == RAKING_THREADS), hand the scan straight to WarpScan
+        if (WARP_SYNCHRONOUS)
+        {
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
+            return;
+        }
+
+        // Each thread deposits its input into its own slot of the shared raking grid
+        T *slot = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+        *slot = input;
+
+        CTA_SYNC();
+
+        // Only the raking threads perform the upsweep / warp scan / downsweep
+        if (linear_tid < RAKING_THREADS)
+        {
+            // Reduce this thread's segment of the grid
+            T segment_total = Upsweep(scan_op);
+
+            // Combined inclusive + exclusive warp scan of the segment totals
+            T running_total;
+            T segment_prefix;
+            WarpScan(temp_storage.warp_scan).Scan(segment_total, running_total, segment_prefix, scan_op);
+
+            // Scan within the segment; thread 0 has no prefix to apply
+            ExclusiveDownsweep(scan_op, segment_prefix, (linear_tid != 0));
+
+            // The last raking thread's inclusive total is published as the block aggregate
+            if (linear_tid == RAKING_THREADS - 1)
+                temp_storage.block_aggregate = running_total;
+        }
+
+        // Barrier before reading back what the raking threads wrote
+        CTA_SYNC();
+
+        // Fetch this thread's scanned value from its slot
+        output = *slot;
+
+        // Every thread picks up the shared block aggregate
+        block_aggregate = temp_storage.block_aggregate;
+    }
+
+
+    /// Exclusive block-wide prefix scan under the binary \p scan_op functor, seeded with \p initial_value, that also hands every thread the block-wide \p block_aggregate of all inputs.  Each thread contributes one input element.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // When every thread is a raking thread (BLOCK_THREADS == RAKING_THREADS), hand the scan straight to WarpScan
+        if (WARP_SYNCHRONOUS)
+        {
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
+            return;
+        }
+
+        // Each thread deposits its input into its own slot of the shared raking grid
+        T *slot = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+        *slot = input;
+
+        CTA_SYNC();
+
+        // Only the raking threads perform the upsweep / warp scan / downsweep
+        if (linear_tid < RAKING_THREADS)
+        {
+            // Reduce this thread's segment of the grid
+            T segment_total = Upsweep(scan_op);
+
+            // Seeded exclusive warp scan of the segment totals, which also produces the aggregate
+            T segment_prefix;
+            WarpScan(temp_storage.warp_scan).ExclusiveScan(segment_total, segment_prefix, initial_value, scan_op, block_aggregate);
+
+            // Scan within the segment, applying the prefix in every raking thread
+            ExclusiveDownsweep(scan_op, segment_prefix);
+
+            // Thread 0 publishes the aggregate for the non-raking threads
+            if (linear_tid == 0)
+                temp_storage.block_aggregate = block_aggregate;
+        }
+
+        // Barrier before reading back what the raking threads wrote
+        CTA_SYNC();
+
+        // Fetch this thread's scanned value from its slot
+        output = *slot;
+
+        // Every thread picks up the shared block aggregate
+        block_aggregate = temp_storage.block_aggregate;
+    }
+
+
+    /// Exclusive block-wide prefix scan under the binary \p scan_op functor, one input element per thread.  The call-back functor \p block_prefix_callback_op is invoked with the block-wide aggregate by the threads performing the warp scan, and the value it returns in <em>lane</em><sub>0</sub> is broadcast and used as the "seed" value that logically prefixes the thread block's scan inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)   // BLOCK_THREADS == RAKING_THREADS: scan directly with WarpScan
+        {
+            T block_aggregate;
+            WarpScan warp_scan(temp_storage.warp_scan);
+            warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);
+
+            // Run the callback on the aggregate, then broadcast lane 0's result to the other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+            // Prepend the prefix to every output; thread 0's output is the prefix itself
+            output = scan_op(block_prefix, output);
+            if (linear_tid == 0)
+                output = block_prefix;
+
+            return;
+        }
+
+        // Each thread deposits its input into its own slot of the shared raking grid
+        T *slot = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+        *slot = input;
+
+        CTA_SYNC();
+
+        // Only the raking threads perform the upsweep / warp scan / downsweep
+        if (linear_tid < RAKING_THREADS)
+        {
+            WarpScan warp_scan(temp_storage.warp_scan);
+            // Reduce this thread's segment of the grid
+            T segment_total = Upsweep(scan_op);
+
+            // Exclusive warp scan of the segment totals, which also produces the aggregate
+            T segment_prefix, block_aggregate;
+            warp_scan.ExclusiveScan(segment_total, segment_prefix, scan_op, block_aggregate);
+
+            // Run the callback on the aggregate, then broadcast lane 0's result to the other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+            // Seed for the downsweep: block prefix combined with the segment prefix (thread 0 uses the block prefix alone)
+            T downsweep_seed = scan_op(block_prefix, segment_prefix);
+            if (linear_tid == 0)
+                downsweep_seed = block_prefix;
+
+            // Scan within the segment, applying the seed in every raking thread
+            ExclusiveDownsweep(scan_op, downsweep_seed);
+        }
+
+        // Barrier before reading back what the raking threads wrote
+        CTA_SYNC();
+
+        // Fetch this thread's scanned value from its slot
+        output = *slot;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  When WARP_SYNCHRONOUS is set the scan is delegated to a single WarpScan; otherwise the inputs are staged in the shared-memory raking grid and scanned by the threads with linear_tid < RAKING_THREADS between two CTA_SYNC() calls.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan (no raking grid and no CTA_SYNC() on this path)
+            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
+        }
+        else
+        {
+            // Place thread partial into this thread's slot of the shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Exclusive warp-synchronous scan of the per-raking-thread partials
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
+
+                // Inclusive raking downsweep scan.  The flag is false only for thread 0 (presumably because its exclusive partial is not a valid prefix -- confirm against InclusiveDownsweep)
+                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
+            }
+
+            CTA_SYNC();
+
+            // Grab this thread's result back from the slot it wrote its input to
+            output = *placement_ptr;
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  On the raking path the aggregate reaches the other threads through temp_storage.block_aggregate.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan, which also yields the aggregate
+            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
+        }
+        else
+        {
+            // Place thread partial into this thread's slot of the shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous scan producing both the inclusive and the exclusive partial
+                T inclusive_partial;
+                T exclusive_partial;
+                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
+
+                // Inclusive raking downsweep scan.  The flag is false only for thread 0 (presumably because its exclusive partial is not a valid prefix -- confirm against InclusiveDownsweep)
+                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
+
+                // The last raking thread stores its inclusive partial to shared memory as the block aggregate
+                if (linear_tid == RAKING_THREADS - 1)
+                    temp_storage.block_aggregate = inclusive_partial;
+            }
+
+            CTA_SYNC();
+
+            // Grab this thread's result back from the slot it wrote its input to
+            output = *placement_ptr;
+
+            // Retrieve the block aggregate that was stored before the second CTA_SYNC()
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked with the block-wide aggregate (by every thread on the warp-synchronous path, by the raking threads otherwise), and the value broadcast from <em>lane</em><sub>0</sub> is used as the "seed" value that logically prefixes the thread block's scan inputs.  The block-wide aggregate is only handed to the call-back functor; this overload does not return it to the caller.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp-synchronous scan
+            T block_aggregate;
+            WarpScan warp_scan(temp_storage.warp_scan);
+            warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);
+
+            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+            // Fold the broadcast block prefix into each thread's inclusive warpscan result
+            output = scan_op(block_prefix, output);
+        }
+        else
+        {
+            // Place thread partial into this thread's slot of the shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            CTA_SYNC();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                WarpScan warp_scan(temp_storage.warp_scan);
+
+                // Raking upsweep reduction across shared partials
+                T upsweep_partial = Upsweep(scan_op);
+
+                // Warp-synchronous exclusive scan of the raking partials, also yielding the block aggregate
+                T exclusive_partial, block_aggregate;
+                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
+
+                // Obtain block-wide prefix in lane0, then broadcast to other lanes
+                T block_prefix = block_prefix_callback_op(block_aggregate);
+                block_prefix = warp_scan.Broadcast(block_prefix, 0);
+
+                // Combine the block prefix with the warpscan exclusive partial; thread 0 uses the block prefix alone
+                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
+                if (linear_tid == 0)
+                    downsweep_prefix = block_prefix;
+
+                // Inclusive raking downsweep scan seeded with the combined prefix
+                InclusiveDownsweep(scan_op, downsweep_prefix);
+            }
+
+            CTA_SYNC();
+
+            // Grab this thread's result back from the slot it wrote its input to
+            output = *placement_ptr;
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/specializations/block_scan_warp_scans.cuh b/external/cub/cub/block/specializations/block_scan_warp_scans.cuh
new file mode 100644
index 00000000000..9252c0a3a7f
--- /dev/null
+++ b/external/cub/cub/block/specializations/block_scan_warp_scans.cuh
@@ -0,0 +1,392 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../util_arch.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+template <
+    typename    T,
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
+struct BlockScanWarpScans
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Compile-time constants derived from the template parameters
+    enum
+    {
+        /// Number of threads per warp, as given by CUB_WARP_THREADS for the targeted PTX_ARCH
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// The thread block size in threads (product of the three block dimensions)
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of warps in the thread block, rounded up so that a trailing partial warp counts as one
+        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+    };
+
+    ///  WarpScan utility type
+    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
+
+    ///  WarpScan utility type parameterized by WARPS instead of WARP_THREADS (not referenced elsewhere in this struct)
+    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScan;
+
+    /// Shared memory storage layout type
+
+    struct __align__(32) _TempStorage
+    {
+        T                               warp_aggregates[WARPS];     ///< One aggregate per warp, written by each warp's last lane in ComputeWarpPrefix
+        typename WarpScanT::TempStorage warp_scan[WARPS];           ///< Buffer for warp-synchronous scans, one entry per warp (indexed by warp_id)
+        T                               block_prefix;               ///< Shared prefix for the entire thread block, written by lane0 of warp0 in the call-back overloads
+    };
+
+
+    /// Alias wrapper (derives from Uninitialized<_TempStorage>) allowing storage to be unioned; the constructor unwraps it with Alias()
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    unsigned int    warp_id;
+    unsigned int    lane_id;
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor.  Binds the collective to \p temp_storage and caches the calling thread's linear thread id, warp id and lane id.
+    __device__ __forceinline__ BlockScanWarpScans(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),                                 // unwrap to the underlying _TempStorage
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),              // single-warp blocks need no division
+        lane_id(LaneId())
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    template <typename ScanOp, int WARP>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &warp_prefix,       ///< [out] Receives the running aggregate when the calling thread belongs to warp \p WARP
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate,   ///< [in,out] Running aggregate; holds the reduction of warps 0..WARP-1 on entry
+        Int2Type<WARP>  /*addend_warp*/)
+    {
+        // Everything accumulated so far is exactly what precedes warp WARP
+        if (warp_id == WARP) { warp_prefix = block_aggregate; }
+
+        // Fold warp WARP's shared aggregate into the running total, then continue with warp WARP + 1
+        T this_warp_total = temp_storage.warp_aggregates[WARP];
+        block_aggregate = scan_op(block_aggregate, this_warp_total);
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
+    }
+
+    template <typename ScanOp>
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &/*warp_prefix*/,       ///< [unused] Warp prefix; left untouched by this overload
+        ScanOp          /*scan_op*/,            ///< [unused] Binary scan operator
+        T               &/*block_aggregate*/,   ///< [unused] Block-wide aggregate; left untouched by this overload
+        Int2Type<WARPS> /*addend_warp*/)        ///< Tag selecting this overload once the warp index reaches WARPS
+    {}  // Recursion terminator for the template unrolling: intentionally empty
+
+
+    /// Use the warp-wide aggregates to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.  The returned prefix is left uninitialized for warp 0, which has no preceding warp.  Contains a CTA_SYNC() between publishing and reading the per-warp aggregates.
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Last lane in each warp shares its warp-aggregate through shared memory
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+
+        CTA_SYNC();
+
+        // Accumulate block aggregates and save the one that is our warp's prefix (never assigned for warp 0)
+        T warp_prefix;
+        block_aggregate = temp_storage.warp_aggregates[0];
+
+        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x); equivalent loop kept below for reference
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
+/*
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; ++WARP)
+        {
+            if (warp_id == WARP)
+                warp_prefix = block_aggregate;
+
+            T addend = temp_storage.warp_aggregates[WARP];
+            block_aggregate = scan_op(block_aggregate, addend);
+        }
+*/
+
+        return warp_prefix;
+    }
+
+
+    /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.  Warp 0 receives \p initial_value itself; every other warp receives scan_op(initial_value, prefix of the preceding warps).
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        const T         &initial_value)     ///< [in] Initial value to seed the exclusive scan
+    {
+        T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
+
+        // The three-argument overload never assigns a prefix for warp 0, so only warps 1..WARPS-1 combine it
+        // with the seed.  warp_id is uniform within a warp, so this guard introduces no intra-warp divergence.
+        if (warp_id != 0)
+            return scan_op(initial_value, warp_prefix);
+
+        return initial_value;
+    }
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Exclusive thread block-wide prefix scan with the binary \p scan_op functor, one input element per thread.  No initial value is supplied, so the output computed for <em>thread</em><sub>0</sub> is undefined.  The block-wide aggregate is computed internally and dropped.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        // Forward to the aggregate-returning overload and ignore the aggregate.  tid0's exclusive output stays invalid.
+        T discarded_aggregate;
+        this->ExclusiveScan(input, exclusive_output, scan_op, discarded_aggregate);
+    }
+
+
+    /// Exclusive thread block-wide prefix scan with the binary \p scan_op functor, seeded with \p initial_value.  Each thread contributes one input element.  The block-wide aggregate is computed internally and dropped.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        T discarded_aggregate;              // required by the forwarding target, never read here
+        this->ExclusiveScan(input, exclusive_output, initial_value, scan_op, discarded_aggregate);
+    }
+
+
+    /// Exclusive thread block-wide prefix scan with the binary \p scan_op functor, one input element per thread.  Every thread also receives the block-wide \p block_aggregate of all inputs.  No initial value is supplied, so the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Scan within each warp first.  Lane0's exclusive result is not meaningful at this point.
+        T warp_inclusive;
+        WarpScanT warp_scan(temp_storage.warp_scan[warp_id]);
+        warp_scan.Scan(input, warp_inclusive, exclusive_output, scan_op);
+
+        // Turn the per-warp totals into this warp's prefix and the block-wide aggregate (prefix invalid for warp0)
+        T warp_prefix = ComputeWarpPrefix(scan_op, warp_inclusive, block_aggregate);
+
+        // Warp0 has nothing in front of it, so its lanes keep their warp-local results
+        if (warp_id == 0)
+            return;
+
+        T combined = scan_op(warp_prefix, exclusive_output);
+        exclusive_output = (lane_id == 0) ? warp_prefix : combined;
+    }
+
+
+    /// Exclusive thread block-wide prefix scan with the binary \p scan_op functor, seeded with \p initial_value.  Each thread contributes one input element, and every thread also receives the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Scan within each warp first.  Lane0's exclusive result is not meaningful at this point.
+        T warp_inclusive;
+        WarpScanT warp_scan(temp_storage.warp_scan[warp_id]);
+        warp_scan.Scan(input, warp_inclusive, exclusive_output, scan_op);
+
+        // Seeded prefix for this warp, plus the block-wide aggregate
+        T seeded_prefix = ComputeWarpPrefix(scan_op, warp_inclusive, block_aggregate, initial_value);
+
+        // Every lane combines with the seeded prefix; lane0 then takes the prefix itself
+        T combined = scan_op(seeded_prefix, exclusive_output);
+        exclusive_output = (lane_id == 0) ? seeded_prefix : combined;
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  The block-wide aggregate is only handed to the call-back functor; this overload does not return it to the caller.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
+        T block_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the thread block prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads through shared memory
+                temp_storage.block_prefix = block_prefix;
+                exclusive_output = block_prefix;                // The block prefix is the exclusive output for tid0
+            }
+        }
+
+        CTA_SYNC();
+
+        // Incorporate thread block prefix into outputs (tid0 already holds the prefix itself, so it is skipped)
+        T block_prefix = temp_storage.block_prefix;
+        if (linear_tid > 0)
+        {
+            exclusive_output = scan_op(block_prefix, exclusive_output);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Inclusive thread block-wide prefix scan with the binary \p scan_op functor, one input element per thread.  The block-wide aggregate is computed internally and dropped.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        T discarded_aggregate;                          // required by the forwarding target, never read here
+        this->InclusiveScan(input, inclusive_output, scan_op, discarded_aggregate);
+    }
+
+
+    /// Inclusive thread block-wide prefix scan with the binary \p scan_op functor, one input element per thread.  Every thread also receives the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Inclusive scan inside each warp
+        WarpScanT warp_scan(temp_storage.warp_scan[warp_id]);
+        warp_scan.InclusiveScan(input, inclusive_output, scan_op);
+
+        // Turn the per-warp totals into this warp's prefix and the block-wide aggregate (prefix invalid for warp0)
+        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
+        if (warp_id == 0)
+            return;
+
+        inclusive_output = scan_op(warp_prefix, inclusive_output);
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  The block-wide aggregate is only handed to the call-back functor; this overload does not return it to the caller.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        T block_aggregate;
+        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
+
+        // Use the first warp to determine the thread block prefix, returning the result in lane0
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads through shared memory
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        CTA_SYNC();
+
+        // Incorporate thread block prefix into every thread's inclusive output
+        T block_prefix = temp_storage.block_prefix;
+        inclusive_output = scan_op(block_prefix, inclusive_output);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/specializations/block_scan_warp_scans2.cuh b/external/cub/cub/block/specializations/block_scan_warp_scans2.cuh
new file mode 100644
index 00000000000..eb0a3a1b54e
--- /dev/null
+++ b/external/cub/cub/block/specializations/block_scan_warp_scans2.cuh
@@ -0,0 +1,436 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../util_arch.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.  The exclusive-scan paths of this variant combine the per-warp aggregates with a second, WARPS-wide WarpScan.
+ */
+template <
+    typename    T,              ///< Data type being scanned
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
+struct BlockScanWarpScans
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        /// Number of threads per warp for the targeted PTX architecture
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of warps needed to cover the thread block (rounded up)
+        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+    };
+
+    ///  WarpScan utility type for scanning thread inputs within one warp
+    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
+
+    ///  WarpScan utility type for scanning the per-warp aggregates (logical warp of WARPS lanes)
+    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScanT;
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpAggregateScanT::TempStorage    inner_scan[WARPS];          ///< Buffers for the scans over warp aggregates (indexed by warp id)
+        typename WarpScanT::TempStorage             warp_scan[WARPS];           ///< Buffers for the per-warp scans of thread inputs (indexed by warp id)
+        T                                           warp_aggregates[WARPS];     ///< Aggregate of each warp's inputs, written by the last lane of that warp
+        T                                           block_prefix;               ///< Shared prefix for the entire thread block
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    unsigned int    warp_id;
+    unsigned int    lane_id;
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor.  Binds the caller-provided storage and caches the calling thread's linear thread id, warp id and lane id.
+    __device__ __forceinline__ BlockScanWarpScans(
+        TempStorage &temp_storage)                                              ///< [in] Reference to the storage used by this collective
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),                  // Single-warp blocks skip the division
+        lane_id(LaneId())
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    template <typename ScanOp, int WARP>                // Recursive step of a compile-time unrolled loop over warps WARP .. WARPS-1
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &warp_prefix,           ///< [in,out] Overwritten with the incoming \p block_aggregate when the calling thread belongs to warp \p WARP
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate,   ///< [in,out] Running aggregate; extended here with warp \p WARP's aggregate and by the recursion with all later warps
+        Int2Type<WARP>  addend_warp)        ///< [in] Tag selecting the warp whose aggregate is folded in at this step (value unused)
+    {
+        if (warp_id == WARP)                    // What has been accumulated so far is this warp's prefix
+            warp_prefix = block_aggregate;
+
+        T addend = temp_storage.warp_aggregates[WARP];
+        block_aggregate = scan_op(block_aggregate, addend);
+
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());      // Continue with the next warp
+    }
+
+    template <typename ScanOp>                          // Terminating overload: selected once the warp index reaches WARPS, does nothing
+    __device__ __forceinline__ void ApplyWarpAggregates(
+        T               &warp_prefix,           ///< [in] Unused
+        ScanOp          scan_op,            ///< [in] Unused
+        T               &block_aggregate,   ///< [in] Unused
+        Int2Type<WARPS> addend_warp)        ///< [in] Tag for the one-past-the-last warp index (value unused)
+    {}
+
+
+    /// Use the warp-wide aggregates to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.  The returned prefix is left uninitialized for threads of <em>warp</em><sub>0</sub>.
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
+
+        CTA_SYNC();                             // All warp aggregates must be written before any thread reads them
+
+        // Accumulate block aggregates and save the one that is our warp's prefix
+        T warp_prefix;                          // Only assigned below for warp_id >= 1
+        block_aggregate = temp_storage.warp_aggregates[0];
+
+        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
+        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
+/*      Loop form of the template recursion above, kept for reference:
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; ++WARP)
+        {
+            if (warp_id == WARP)
+                warp_prefix = block_aggregate;
+
+            T addend = temp_storage.warp_aggregates[WARP];
+            block_aggregate = scan_op(block_aggregate, addend);
+        }
+*/
+
+        return warp_prefix;
+    }
+
+
+    /// Seeded variant: derives the calling warp's prefix from the warp-wide aggregates and \p initial_value.  The block-wide aggregate is also returned to every thread through \p block_aggregate.
+    template <typename ScanOp>
+    __device__ __forceinline__ T ComputeWarpPrefix(
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
+        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
+        const T         &initial_value)     ///< [in] Initial value to seed the exclusive scan
+    {
+        // Unseeded prefix first (not meaningful for warp0), then fold the seed in front of it
+        T unseeded_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
+        T seeded_prefix   = scan_op(initial_value, unseeded_prefix);
+
+        // Nothing precedes warp0, so its prefix is the seed alone
+        if (warp_id == 0)
+            return initial_value;
+        return seeded_prefix;
+    }
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Exclusive thread block-wide prefix scan under the binary \p scan_op functor, one input element per thread.  No seed is supplied, so the value produced for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Item contributed by the calling thread
+        T               &exclusive_output,              ///< [out] Scan result for the calling thread (may alias \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        T unused_aggregate;
+        // Forward to the aggregate-returning overload and drop the aggregate.  The result for tid0 stays invalid.
+        ExclusiveScan(input, exclusive_output, scan_op, unused_aggregate);
+    }
+
+
+    /// Exclusive thread block-wide prefix scan under the binary \p scan_op functor, seeded with \p initial_value.  One input element per thread; the block-wide aggregate is computed but discarded.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &exclusive_output,  ///< [out] Scan result for the calling thread (may alias \p input)
+        const T         &initial_value,     ///< [in] Seed for the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        T unused_aggregate;
+        ExclusiveScan(input, exclusive_output, initial_value, scan_op, unused_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
+
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
+        // The section below replaces: T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
+
+        // NOTE(review): assumes BLOCK_THREADS is a multiple of WARP_THREADS so that every warp has a lane WARP_THREADS - 1 -- confirm with callers
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        // Get the warp scan partial (both values stay uninitialized in lanes >= WARPS)
+        T warp_inclusive, warp_prefix;
+        if (lane_id < WARPS)
+        {
+            // Scan the warpscan partials: within each warp, lane i handles warp i's aggregate
+            T warp_val = temp_storage.warp_aggregates[lane_id];
+            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op);
+        }
+
+        warp_prefix         = my_warp_scan.Broadcast(warp_prefix, warp_id);             // presumably fetches the value held by lane warp_id -- verify against WarpScan::Broadcast
+        block_aggregate     = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);        // presumably fetches the inclusive result of the last warp's lane
+        // End of the inlined warp-prefix computation
+
+        // Apply warp prefix to our lane's partial
+        if (warp_id != 0)
+        {
+            exclusive_output = scan_op(warp_prefix, exclusive_output);
+            if (lane_id == 0)                   // lane0's warp-local exclusive value is invalid, so it takes the warp prefix directly
+                exclusive_output = warp_prefix;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor, seeded with \p initial_value.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
+
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
+
+        // Compute the warp-wide prefix and block-wide aggregate for each warp
+        // The section below replaces: T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
+
+        // NOTE(review): assumes BLOCK_THREADS is a multiple of WARP_THREADS so that every warp has a lane WARP_THREADS - 1 -- confirm with callers
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        // Get the warp scan partial (both values stay uninitialized in lanes >= WARPS)
+        T warp_inclusive, warp_prefix;
+        if (lane_id < WARPS)
+        {
+            // Scan the warpscan partials, seeding the scan with the initial value
+            T warp_val = temp_storage.warp_aggregates[lane_id];
+            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op);
+        }
+
+        warp_prefix         = my_warp_scan.Broadcast(warp_prefix, warp_id);             // presumably fetches the value held by lane warp_id -- verify against WarpScan::Broadcast
+        block_aggregate     = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);        // NOTE(review): may include initial_value if the seeded WarpScan::Scan folds it into its inclusive output -- confirm
+        // End of the inlined warp-prefix computation
+
+        // Apply warp prefix to our lane's partial (all warps, including warp0, since the prefix carries the seed)
+        exclusive_output = scan_op(warp_prefix, exclusive_output);
+        if (lane_id == 0)                       // lane0's warp-local exclusive value is invalid, so it takes the warp prefix directly
+            exclusive_output = warp_prefix;
+    }
+
+
+    /// Exclusive thread block-wide prefix scan under the binary \p scan_op functor, one input element per thread.  The first warp invokes the call-back functor \p block_prefix_callback_op with the block-wide aggregate, and the value returned to <em>lane</em><sub>0</sub> of that warp becomes the "seed" that logically prefixes the thread block's scan inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Item contributed by the calling thread
+        T                       &exclusive_output,              ///< [out] Scan result for the calling thread (may alias \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor supplying the thread block-wide prefix applied to all inputs.
+    {
+        // Unseeded block-wide exclusive scan first; the result for tid0 is invalid until it is seeded below
+        T tile_aggregate;
+        ExclusiveScan(input, exclusive_output, scan_op, tile_aggregate);
+
+        // All threads of warp0 run the call-back, but only lane0's return value is published
+        if (warp_id == 0)
+        {
+            T seed = block_prefix_callback_op(tile_aggregate);
+            if (lane_id == 0)
+            {
+                // tid0 has no preceding inputs, so the seed itself is its exclusive result
+                exclusive_output = seed;
+                temp_storage.block_prefix = seed;
+            }
+        }
+
+        CTA_SYNC();
+
+        // Every thread other than tid0 folds the published seed in front of its partial
+        T published_seed = temp_storage.block_prefix;
+        if (linear_tid != 0)
+        {
+            exclusive_output = scan_op(published_seed, exclusive_output);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Inclusive thread block-wide prefix scan under the binary \p scan_op functor, one input element per thread.  The block-wide aggregate is computed but discarded.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Item contributed by the calling thread
+        T               &inclusive_output,              ///< [out] Scan result for the calling thread (may alias \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        T unused_aggregate;
+        InclusiveScan(input, inclusive_output, scan_op, unused_aggregate);
+    }
+
+
+    /// Inclusive thread block-wide prefix scan under the binary \p scan_op functor, one input element per thread.  Every thread additionally receives the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Item contributed by the calling thread
+        T               &inclusive_output,              ///< [out] Scan result for the calling thread (may alias \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Warp-local inclusive scan of the thread inputs
+        WarpScanT warp_scanner(temp_storage.warp_scan[warp_id]);
+        warp_scanner.InclusiveScan(input, inclusive_output, scan_op);
+
+        // Aggregate of all preceding warps (invalid for warp0) plus the block-wide aggregate
+        T preceding_warps = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
+
+        // Warps after the first fold the preceding warps' aggregate in front of their partial
+        if (warp_id > 0)
+            inclusive_output = scan_op(preceding_warps, inclusive_output);
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        T block_aggregate;
+        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
+
+        // Every thread of the first warp invokes the call-back with the block aggregate; only lane0's result is kept
+        if (warp_id == 0)
+        {
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            if (lane_id == 0)
+            {
+                // Share the prefix with all threads
+                temp_storage.block_prefix = block_prefix;
+            }
+        }
+
+        CTA_SYNC();
+
+        // Incorporate thread block prefix into outputs (unlike the exclusive variant, every thread including tid0 is seeded)
+        T block_prefix = temp_storage.block_prefix;
+        inclusive_output = scan_op(block_prefix, inclusive_output);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/block/specializations/block_scan_warp_scans3.cuh b/external/cub/cub/block/specializations/block_scan_warp_scans3.cuh
new file mode 100644
index 00000000000..18bd585823a
--- /dev/null
+++ b/external/cub/cub/block/specializations/block_scan_warp_scans3.cuh
@@ -0,0 +1,418 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../util_arch.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.  This variant scans the block as INNER_WARP_THREADS "outer" logical warps whose aggregates are then scanned by a single "inner" warp.
+ */
+template <
+    typename    T,              ///< Data type being scanned
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
+struct BlockScanWarpScans
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        /// Number of threads in the inner warp and in each outer logical warp
+        INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+        OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS,    // NOTE(review): truncating division; assumes BLOCK_THREADS is a non-zero multiple of INNER_WARP_THREADS -- confirm
+
+        /// Number of outer scan warps (one per thread of the inner warp)
+        OUTER_WARPS = INNER_WARP_THREADS
+    };
+
+    ///  Outer WarpScan utility type
+    typedef WarpScan<T, OUTER_WARP_THREADS, PTX_ARCH> OuterWarpScanT;
+
+    ///  Inner WarpScan utility type
+    typedef WarpScan<T, INNER_WARP_THREADS, PTX_ARCH> InnerWarpScanT;
+
+    typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS];   ///< One outer-scan storage slot per outer warp
+
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        union Aliasable                                                         // Outer and inner scan storage share the same memory
+        {
+            Uninitialized<OuterScanArray>           outer_warp_scan;  ///< Buffer for warp-synchronous outer scans
+            typename InnerWarpScanT::TempStorage    inner_warp_scan;  ///< Buffer for warp-synchronous inner scan
+
+        } aliasable;
+
+        T                               warp_aggregates[OUTER_WARPS];              ///< Outer-warp aggregates, later overwritten with each outer warp's exclusive prefix
+
+        T                               block_aggregate;                           ///< Shared block-wide aggregate for the entire thread block
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    // Thread fields
+    _TempStorage    &temp_storage;
+    unsigned int    linear_tid;
+    unsigned int    warp_id;
+    unsigned int    lane_id;
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor.  Binds the caller-provided storage and caches the calling thread's linear thread id plus its outer-warp id and lane within that outer warp.
+    __device__ __forceinline__ BlockScanWarpScans(
+        TempStorage &temp_storage)                                                      ///< [in] Reference to the storage used by this collective
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
+        warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS),              // Index of the outer logical warp
+        lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS)      // Position within the outer logical warp
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Exclusive thread block-wide prefix scan under the binary \p scan_op functor, one input element per thread.  No seed is supplied, so the value produced for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,                          ///< [in] Item contributed by the calling thread
+        T               &exclusive_output,              ///< [out] Scan result for the calling thread (may alias \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        T unused_aggregate;
+        // Forward to the aggregate-returning overload and drop the aggregate.  The result for tid0 stays invalid.
+        ExclusiveScan(input, exclusive_output, scan_op, unused_aggregate);
+    }
+
+
+    /// Exclusive thread block-wide prefix scan under the binary \p scan_op functor, seeded with \p initial_value.  One input element per thread; the block-wide aggregate is computed but discarded.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread
+        T               &exclusive_output,  ///< [out] Scan result for the calling thread (may alias \p input)
+        const T         &initial_value,     ///< [in] Seed for the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        T unused_aggregate;
+        ExclusiveScan(input, exclusive_output, initial_value, scan_op, unused_aggregate);
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item
+        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each outer warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
+            input, inclusive_output, exclusive_output, scan_op);
+
+        // Share outer warp total (the last lane's inclusive result)
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();                             // Aggregates are complete; the aliased outer-scan storage may now be reused by the inner scan
+
+        if (linear_tid < INNER_WARP_THREADS)    // The first INNER_WARP_THREADS threads each handle one outer warp's aggregate
+        {
+            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
+            T outer_warp_exclusive;
+
+            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
+                outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
+
+            temp_storage.block_aggregate                = block_aggregate;
+            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;     // Each aggregate slot now holds that outer warp's exclusive prefix
+        }
+
+        CTA_SYNC();                             // Publish the block aggregate and the per-warp prefixes
+
+        if (warp_id != 0)                       // NOTE(review): warp0 threads keep the aggregate from the inner scan, which relies on them all having linear_tid < INNER_WARP_THREADS -- confirm
+        {
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+
+            // Apply warp prefix to our lane's partial
+            T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+            exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
+            if (lane_id == 0)                   // lane0's warp-local exclusive value is invalid, so it takes the warp prefix directly
+                exclusive_output = outer_warp_exclusive;
+        }
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor, seeded with \p initial_value.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input items
+        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
+        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute warp scan in each outer warp.  The exclusive output from each lane0 is invalid.
+        T inclusive_output;
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
+            input, inclusive_output, exclusive_output, scan_op);
+
+        // Share outer warp total (the last lane's inclusive result)
+        if (lane_id == OUTER_WARP_THREADS - 1)
+        {
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+        }
+
+        CTA_SYNC();                             // Aggregates are complete; the aliased outer-scan storage may now be reused by the inner scan
+
+        if (linear_tid < INNER_WARP_THREADS)    // The first INNER_WARP_THREADS threads each handle one outer warp's aggregate
+        {
+            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
+            T outer_warp_exclusive;
+
+            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
+                outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate);
+
+            temp_storage.block_aggregate                = block_aggregate;
+            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;     // Each aggregate slot now holds that outer warp's seeded exclusive prefix
+        }
+
+        CTA_SYNC();                             // Publish the block aggregate and the per-warp prefixes
+
+        // Retrieve block aggregate
+        block_aggregate = temp_storage.block_aggregate;
+
+        // Apply warp prefix to our lane's partial (all warps, including warp0, since the prefix carries the seed)
+        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+        exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
+        if (lane_id == 0)                       // lane0's warp-local exclusive value is invalid, so it takes the warp prefix directly
+            exclusive_output = outer_warp_exclusive;
+    }
+
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid (it is overwritten with the warp prefix at the end of this function).
+        T inclusive_output;
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
+            input, inclusive_output, exclusive_output, scan_op);
+
+        // Share outer warp total (the inclusive result held by the last lane of each outer warp)
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)    // These threads scan the per-warp totals, one total per thread
+        {
+            InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
+
+            T upsweep = temp_storage.warp_aggregates[linear_tid];   // Total of outer warp number linear_tid
+            T downsweep_prefix, block_aggregate;
+
+            inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
+
+            // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = inner_scan.Broadcast(block_prefix, 0);
+
+            downsweep_prefix = scan_op(block_prefix, downsweep_prefix);     // Fold the block prefix into each warp's prefix
+            if (linear_tid == 0)                                            // Thread 0's scanned value is discarded: its prefix is the block prefix alone
+                downsweep_prefix = block_prefix;
+
+            temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;    // Slot is reused: it now holds the prefix for that warp
+        }
+
+        CTA_SYNC();
+
+        // Apply warp prefix to our lane's partial (or assign it if partial is invalid)
+        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+        exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
+        if (lane_id == 0)
+            exclusive_output = outer_warp_exclusive;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scans
+    //---------------------------------------------------------------------
+
+    /// Inclusive thread block-wide prefix scan under the binary \p scan_op functor, one input element per thread.  Forwards to the aggregate-returning overload and discards the aggregate.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Item contributed by the calling thread
+        T               &inclusive_output,              ///< [out] Scan result for the calling thread (may be aliased to \p input)
+        ScanOp          scan_op)                        ///< [in] Binary scan operator
+    {
+        T ignored_aggregate;    // Required by the four-argument overload; never read here
+        this->InclusiveScan(input, inclusive_output, scan_op, ignored_aggregate);
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,                          ///< [in] Calling thread's input item
+        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp          scan_op,                        ///< [in] Binary scan operator
+        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        // Compute inclusive warp scan in each warp.
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
+            input, inclusive_output, scan_op);
+
+        // Share outer warp total (the inclusive result held by the last lane of each outer warp)
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)    // These threads scan the per-warp totals, one total per thread
+        {
+            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
+            T outer_warp_exclusive;
+
+            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
+                outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
+
+            temp_storage.block_aggregate                = block_aggregate;          // Publish for threads that skipped this branch
+            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;     // Slot is reused: it now holds that warp's exclusive prefix
+        }
+
+        CTA_SYNC();
+        // Warp 0 is skipped below.  NOTE(review): its threads keep the block_aggregate set by the inner scan above, which only runs for linear_tid < INNER_WARP_THREADS -- confirm every warp-0 thread satisfies that.
+        if (warp_id != 0)
+        {
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+
+            // Apply warp prefix to our lane's partial
+            T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+            inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp                  scan_op,                        ///< [in] Binary scan operator
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute inclusive warp scan in each warp.
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
+            input, inclusive_output, scan_op);
+
+        // Share outer warp total (the inclusive result held by the last lane of each outer warp)
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)    // These threads scan the per-warp totals, one total per thread
+        {
+            InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
+
+            T upsweep = temp_storage.warp_aggregates[linear_tid];   // Total of outer warp number linear_tid
+            T downsweep_prefix, block_aggregate;
+            inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
+
+            // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = inner_scan.Broadcast(block_prefix, 0);
+
+            downsweep_prefix = scan_op(block_prefix, downsweep_prefix);     // Fold the block prefix into each warp's prefix
+            if (linear_tid == 0)                                            // Thread 0's scanned value is discarded: its prefix is the block prefix alone
+                downsweep_prefix = block_prefix;
+
+            temp_storage.warp_aggregates[linear_tid]    = downsweep_prefix; // Slot is reused: it now holds the prefix for that warp
+        }
+
+        CTA_SYNC();
+
+        // Apply warp prefix to our lane's partial (every warp, including warp 0, receives at least the block prefix)
+        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+        inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/cub.cuh b/external/cub/cub/cub.cuh
new file mode 100644
index 00000000000..b1c8e3200ab
--- /dev/null
+++ b/external/cub/cub/cub.cuh
@@ -0,0 +1,95 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * CUB umbrella include file
+ */
+
+#pragma once
+
+
+// Block
+#include "block/block_histogram.cuh"
+#include "block/block_discontinuity.cuh"
+#include "block/block_exchange.cuh"
+#include "block/block_load.cuh"
+#include "block/block_radix_rank.cuh"
+#include "block/block_radix_sort.cuh"
+#include "block/block_reduce.cuh"
+#include "block/block_scan.cuh"
+#include "block/block_store.cuh"
+//#include "block/block_shift.cuh"
+
+// Device
+#include "device/device_histogram.cuh"
+#include "device/device_partition.cuh"
+#include "device/device_radix_sort.cuh"
+#include "device/device_reduce.cuh"
+#include "device/device_run_length_encode.cuh"
+#include "device/device_scan.cuh"
+#include "device/device_segmented_radix_sort.cuh"
+#include "device/device_segmented_reduce.cuh"
+#include "device/device_select.cuh"
+#include "device/device_spmv.cuh"
+
+// Grid
+//#include "grid/grid_barrier.cuh"
+#include "grid/grid_even_share.cuh"
+#include "grid/grid_mapping.cuh"
+#include "grid/grid_queue.cuh"
+
+// Thread
+#include "thread/thread_load.cuh"
+#include "thread/thread_operators.cuh"
+#include "thread/thread_reduce.cuh"
+#include "thread/thread_scan.cuh"
+#include "thread/thread_store.cuh"
+
+// Warp
+#include "warp/warp_reduce.cuh"
+#include "warp/warp_scan.cuh"
+
+// Iterator
+#include "iterator/arg_index_input_iterator.cuh"
+#include "iterator/cache_modified_input_iterator.cuh"
+#include "iterator/cache_modified_output_iterator.cuh"
+#include "iterator/constant_input_iterator.cuh"
+#include "iterator/counting_input_iterator.cuh"
+#include "iterator/tex_obj_input_iterator.cuh"
+#include "iterator/tex_ref_input_iterator.cuh"
+#include "iterator/transform_input_iterator.cuh"
+
+// Util
+#include "util_arch.cuh"
+#include "util_debug.cuh"
+#include "util_device.cuh"
+#include "util_macro.cuh"
+#include "util_ptx.cuh"
+#include "util_type.cuh"
+
diff --git a/external/cub/cub/device/device_histogram.cuh b/external/cub/cub/device/device_histogram.cuh
new file mode 100644
index 00000000000..db131eee764
--- /dev/null
+++ b/external/cub/cub/device/device_histogram.cuh
@@ -0,0 +1,866 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of sample data residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "dispatch/dispatch_histogram.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of sample data residing within device-accessible memory. ![](histogram_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
+ * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceHistogram}
+ *
+ */
+struct DeviceHistogram
+{
+    /******************************************************************//**
+     * \name Evenly-segmented bin ranges
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
+     *
+     * \par
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of a six-bin histogram
+     * from a sequence of float samples
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_samples;    // e.g., 10
+     * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
+     * int*     d_histogram;    // e.g., [ -, -, -, -, -, -]
+     * int      num_levels;     // e.g., 7       (seven level boundaries for six bins)
+     * float    lower_level;    // e.g., 0.0     (lower sample value boundary of lowest bin)
+     * float    upper_level;    // e.g., 12.0    (upper sample value boundary of upper bin)
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
+     *
+     * // d_histogram   <-- [1, 5, 0, 3, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
+        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
+        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
+        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
+        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
+        OffsetT             num_samples,                                ///< [in] The number of input samples (i.e., the length of \p d_samples)
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+        // Box each scalar argument into a one-element array for the single-channel MultiHistogramEven<1, 1> call
+        CounterT*           d_histogram1[1]     = {d_histogram};
+        int                 num_levels1[1]      = {num_levels};
+        LevelT              lower_level1[1]     = {lower_level};
+        LevelT              upper_level1[1]     = {upper_level};
+
+        return MultiHistogramEven<1, 1>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram1,
+            num_levels1,
+            lower_level1,
+            upper_level1,
+            num_samples,                        // Presumably the row length: the whole input is treated as a single row
+            static_cast<OffsetT>(1),            // One row.  Cast so the literal has type OffsetT: a bare int `1` next to num_samples gives conflicting deductions whenever OffsetT is not int
+            sizeof(SampleT) * num_samples,      // Byte size of that single row
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
+     *
+     * \par
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of a six-bin histogram
+     * from a 2x5 region of interest within a flattened 2x7 array of float samples.
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_row_samples;    // e.g., 5
+     * int      num_rows;           // e.g., 2;
+     * size_t   row_stride_bytes;   // e.g., 7 * sizeof(float)
+     * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
+     *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
+     * int*     d_histogram;        // e.g., [ -, -, -, -, -, -]
+     * int      num_levels;         // e.g., 7       (seven level boundaries for six bins)
+     * float    lower_level;        // e.g., 0.0     (lower sample value boundary of lowest bin)
+     * float    upper_level;        // e.g., 12.0    (upper sample value boundary of upper bin)
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage  = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [1, 5, 0, 3, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible temporary storage.  When NULL, only the required size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                         ///< [in,out] Size in bytes of the \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] Iterator to the input sequence of data samples
+        CounterT*           d_histogram,                                ///< [out] Histogram counter output array of length <tt>num_levels</tt> - 1
+        int                 num_levels,                                 ///< [in] Number of level boundaries; the number of bins is <tt>num_levels</tt> - 1
+        LevelT              lower_level,                                ///< [in] Lower sample value bound (inclusive) of the lowest bin
+        LevelT              upper_level,                                ///< [in] Upper sample value bound (exclusive) of the highest bin
+        OffsetT             num_row_samples,                            ///< [in] Number of data samples per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] Number of rows in the region of interest
+        size_t              row_stride_bytes,                           ///< [in] Number of bytes between the starts of consecutive rows in the region of interest
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Single-channel wrapper: the multi-channel entry point takes its per-channel
+        // arguments as arrays, so each scalar is boxed into a one-element array.
+        CounterT*           histogram_ptrs[1]   = { d_histogram };
+        int                 level_counts[1]     = { num_levels };
+        LevelT              lower_bounds[1]     = { lower_level };
+        LevelT              upper_bounds[1]     = { upper_level };
+
+        // The region-of-interest arguments and launch options are forwarded unchanged.
+        return MultiHistogramEven<1, 1>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            histogram_ptrs,
+            level_counts, lower_bounds, upper_bounds,
+            num_row_samples, num_rows, row_stride_bytes,
+            stream,
+            debug_synchronous);
+    }
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
+     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms
+     * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int              num_pixels;         // e.g., 5
+     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
+     *                                      //        (0, 6, 7, 5), (3, 0, 2, 6)]
+     * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
+     *                                      //       each allocated with 256 integer counters
+     * int              num_levels[3];      // e.g., {257, 257, 257};
+     * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
+     * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
+     *
+     * // d_histogram   <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
+     * //                     [3, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
+     * //                     [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_pixels,                                 ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        // Forward to the region-of-interest overload, treating the flat sequence as a
+        // single row of num_pixels pixels whose stride is the full length in bytes.
+        //
+        // The row count is cast to OffsetT on purpose: the region-of-interest overload
+        // deduces OffsetT from both num_row_pixels and num_rows, so a bare int literal
+        // makes template argument deduction fail whenever OffsetT is not int.
+        return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram,
+            num_levels,
+            lower_level,
+            upper_level,
+            num_pixels,
+            (OffsetT) 1,
+            (size_t) (sizeof(SampleT) * NUM_CHANNELS * num_pixels),
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_pixels, \p num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
+     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms from a 2x3 region of
+     * interest within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int              num_row_pixels;     // e.g., 3
+     * int              num_rows;           // e.g., 2
+     * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
+     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -),
+     *                                      //        (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)]
+     * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
+     *                                      //       each allocated with 256 integer counters
+     * int              num_levels[3];      // e.g., {257, 257, 257};
+     * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
+     * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
+     * //                     [3, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
+     * //                     [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramEven(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Value type obtained by dereferencing the sample iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        // Dispatch flavors: one indexed with the caller's offset type, one with plain int
+        typedef DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT> DispatchNativeT;
+        typedef DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>     DispatchIntT;
+
+        // Tag handed to the dispatch layer telling it whether samples are one byte wide
+        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
+
+        // Row stride expressed in samples rather than bytes
+        const size_t row_stride_samples = row_stride_bytes / sizeof(SampleT);
+
+        // Down-convert to int offsets only when OffsetT is wider than int and the byte
+        // extent (num_rows * row_stride_bytes) is below the largest int value
+        const bool use_int_offsets =
+            (sizeof(OffsetT) > sizeof(int)) &&
+            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max());
+
+        if (!use_int_offsets)
+        {
+            // Keep the caller's offset type
+            return DispatchNativeT::DispatchEven(
+                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
+                num_row_pixels, num_rows, (OffsetT) row_stride_samples,
+                stream, debug_synchronous, is_byte_sample);
+        }
+
+        // Down-converted OffsetT data type
+        return DispatchIntT::DispatchEven(
+            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
+            (int) num_row_pixels, (int) num_rows, (int) row_stride_samples,
+            stream, debug_synchronous, is_byte_sample);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Custom bin ranges
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of a six-bin histogram
+     * from a sequence of float samples
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_samples;    // e.g., 10
+     * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
+     * int*     d_histogram;    // e.g., [ -, -, -, -, -, -]
+     * int      num_levels;     // e.g., 7 (seven level boundaries for six bins)
+     * float*   d_levels;       // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
+     *
+     * // d_histogram   <-- [1, 5, 0, 3, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
+        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
+        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
+        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_samples,                            ///< [in] The number of input samples (i.e., the length of \p d_samples)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        // Wrap the single-channel arguments in one-element arrays so the
+        // multi-channel entry point can be reused
+        CounterT*           d_histogram1[1] = {d_histogram};
+        int                 num_levels1[1]  = {num_levels};
+        LevelT*             d_levels1[1]    = {d_levels};
+
+        // Forward as a single row of num_samples samples.  The row count is given the
+        // same type as num_samples so that deduction of the offset type in the
+        // forwarded call cannot conflict when OffsetT is not int.
+        return MultiHistogramRange<1, 1>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram1,
+            num_levels1,
+            d_levels1,
+            num_samples,
+            (OffsetT) 1,
+            (size_t) (sizeof(SampleT) * num_samples),
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of a six-bin histogram
+     * from a 2x5 region of interest within a flattened 2x7 array of float samples.
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_row_samples;    // e.g., 5
+     * int      num_rows;           // e.g., 2;
+     * int      row_stride_bytes;   // e.g., 7 * sizeof(float)
+     * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
+     *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
+     * int*     d_histogram;        // e.g., [ , , , , , ]
+     * int      num_levels;         // e.g., 7 (seven level boundaries for six bins)
+     * float    *d_levels;          // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels,
+     *     num_row_samples, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [1, 5, 0, 3, 0, 0];
+     *
+     * \endcode
+     *
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t HistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
+        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
+        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
+        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_samples,                        ///< [in] The number of data samples per row in the region of interest
+        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
+        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Package the scalar arguments as one-element arrays so that the
+        // single-channel case can go through the multi-channel entry point
+        CounterT*   histogram_ptrs[1]   = {d_histogram};
+        int         level_counts[1]     = {num_levels};
+        LevelT*     level_ptrs[1]       = {d_levels};
+
+        // One interleaved channel, of which one is active
+        return MultiHistogramRange<1, 1>(
+            d_temp_storage, temp_storage_bytes, d_samples,
+            histogram_ptrs, level_counts, level_ptrs,
+            num_row_samples, num_rows, row_stride_bytes,
+            stream, debug_synchronous);
+    }
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the value range for bin<sub><em>j</em></sub> is
+     *   [<tt>d_levels[i][j]</tt>, <tt>d_levels[i][j+1]</tt>)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms
+     * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int            num_pixels;       // e.g., 5
+     * unsigned char  *d_samples;       // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2),
+     *                                  //        (0, 6, 7, 5),(3, 0, 2, 6)]
+     * unsigned int   *d_histogram[3];  // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
+     * int            num_levels[3];    // e.g., {5, 5, 5};
+     * unsigned int   *d_levels[3];     // e.g., [ [0, 2, 4, 6, 8],
+     *                                  //         [0, 2, 4, 6, 8],
+     *                                  //         [0, 2, 4, 6, 8] ];
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
+     *
+     * // d_histogram   <-- [ [1, 3, 0, 1],
+     * //                     [3, 0, 0, 2],
+     * //                     [0, 2, 0, 3] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_pixels,                             ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+        // Forward as a single row of num_pixels pixels whose stride is the full
+        // length in bytes.  The row count is given the same type as num_pixels so
+        // that deduction of the offset type in the forwarded call cannot conflict
+        // when OffsetT is not int.
+        return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram,
+            num_levels,
+            d_levels,
+            num_pixels,
+            (OffsetT) 1,
+            (size_t) (sizeof(SampleT) * NUM_CHANNELS * num_pixels),
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
+     *
+     * \par
+     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
+     *   pixel samples).
+     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
+     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
+     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+     * - For channel<sub><em>i</em></sub>, the value range for bin<sub><em>j</em></sub> is
+     *   [<tt>d_levels[i][j]</tt>, <tt>d_levels[i][j+1]</tt>)
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms from a 2x3 region of
+     * interest within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int              num_row_pixels;     // e.g., 3
+     * int              num_rows;           // e.g., 2
+     * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
+     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -),
+     *                                      //        (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)]
+     * int*             d_histogram[3];     // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
+     * int              num_levels[3];      // e.g., {5, 5, 5};
+     * unsigned int*    d_levels[3];        // e.g., [ [0, 2, 4, 6, 8],
+     *                                      //         [0, 2, 4, 6, 8],
+     *                                      //         [0, 2, 4, 6, 8] ];
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Compute histograms
+     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
+     *
+     * // d_histogram   <-- [ [2, 3, 0, 1],
+     * //                     [4, 0, 0, 2],
+     * //                     [1, 2, 0, 3] ]
+     *
+     * \endcode
+     *
+     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+     * \tparam NUM_ACTIVE_CHANNELS      Number of channels actively being histogrammed (specified explicitly; it cannot be deduced from the array-typed arguments)
+     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
+     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
+     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
+     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
+     */
+    template <
+        int                 NUM_CHANNELS,
+        int                 NUM_ACTIVE_CHANNELS,
+        typename            SampleIteratorT,
+        typename            CounterT,
+        typename            LevelT,
+        typename            OffsetT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t MultiHistogramRange(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
+        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
+        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+        Int2Type<sizeof(SampleT) == 1> is_byte_sample;   // Compile-time tag: set when SampleT is one byte wide; forwarded to DispatchRange below
+        // Use plain int offsets when OffsetT is wider than int and num_rows * row_stride_bytes is below the int maximum
+        if ((sizeof(OffsetT) > sizeof(int)) &&
+            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
+        {
+            // Down-convert OffsetT data type
+            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
+                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
+                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),   // row stride converted from bytes to a count of SampleT elements
+                stream, debug_synchronous, is_byte_sample);
+        }
+        // Otherwise dispatch with the caller-supplied OffsetT (row stride is again converted from bytes to samples)
+        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
+            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
+            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
+            stream, debug_synchronous, is_byte_sample);
+    }
+
+
+
+    //@}  end member group
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/device_partition.cuh b/external/cub/cub/device/device_partition.cuh
new file mode 100644
index 00000000000..154506edcc0
--- /dev/null
+++ b/external/cub/cub/device/device_partition.cuh
@@ -0,0 +1,273 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_select_if.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. ![](partition_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from
+ * a specified input sequence.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DevicePartition}
+ *
+ * \par Performance
+ * \linear_performance{partition}
+ *
+ * \par
+ * The following chart illustrates DevicePartition::If
+ * performance across different CUDA architectures for \p int32 items,
+ * where 50% of the items are randomly selected for the first partition.
+ * \plots_below
+ *
+ * \image html partition_if_int32_50_percent.png
+ *
+ */
+struct DevicePartition
+{
+    /**
+     * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png)
+     *
+     * \par
+     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
+     * - Copies of the selected items are compacted into \p d_out and maintain their original
+     *   relative ordering, however copies of the unselected items are compacted into the
+     *   rear of \p d_out in reverse order.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the flag-based partitioning of an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_partition.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
+     * int  num_items;              // e.g., 8
+     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
+     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int  *d_num_selected_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // d_out                 <-- [1, 4, 6, 7, 8, 5, 3, 2]
+     * // d_num_selected_out    <-- [4]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    FlagIterator,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Flagged(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
+        int                         num_items,                      ///< [in] Total number of items to select from
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType                SelectOp;       // Selection op (not used)
+        typedef NullType                EqualityOp;     // Equality operator (not used)
+        // NOTE(review): the trailing `true` template argument is presumably the keep-rejects (partition) switch -- confirm in dispatch_select_if.cuh
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_flags,
+            d_out,
+            d_num_selected_out,
+            SelectOp(),         // placeholder NullType instance; the selection op is not used here
+            EqualityOp(),       // placeholder NullType instance; the equality op is not used here
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png)
+     *
+     * \par
+     * - Copies of the selected items are compacted into \p d_out and maintain their original
+     *   relative ordering, however copies of the unselected items are compacted into the
+     *   rear of \p d_out in reverse order.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated partition-if performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
+     * selected for the first partition with 50% probability.
+     *
+     * \image html partition_if_int32_50_percent.png
+     * \image html partition_if_int64_50_percent.png
+     *
+     * \par
+     * The following charts are similar, but 5% selection probability for the first partition:
+     *
+     * \image html partition_if_int32_5_percent.png
+     * \image html partition_if_int64_5_percent.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the partitioning of an \p int device vector with a selection functor.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
+     *
+     * // Functor type for selecting values less than some criteria
+     * struct LessThan
+     * {
+     *     int compare;
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     LessThan(int compare) : compare(compare) {}
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     bool operator()(const int &a) const {
+     *         return (a < compare);
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int      num_items;              // e.g., 8
+     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
+     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int      *d_num_selected_out;    // e.g., [ ]
+     * LessThan select_op(7);
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // d_out                 <-- [0, 2, 3, 5, 2, 8, 81, 9]
+     * // d_num_selected_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
+     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     * \tparam SelectOp             <b>[inferred]</b> Selection functor type having member <tt>bool operator()(const T &a)</tt>
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT,
+        typename                    SelectOp>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t If(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
+        int                         num_items,                      ///< [in] Total number of items to select from
+        SelectOp                    select_op,                      ///< [in] Unary selection operator
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     OffsetT;         // Signed integer type for global offsets
+        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
+        typedef NullType                EqualityOp;     // Equality operator (not used)
+        // NOTE(review): the trailing `true` template argument is presumably the keep-rejects (partition) switch -- confirm in dispatch_select_if.cuh
+        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            NULL,               // no flag sequence: FlagIterator is the unused NullType* placeholder
+            d_out,
+            d_num_selected_out,
+            select_op,
+            EqualityOp(),       // placeholder NullType instance; the equality op is not used here
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+/**
+ * \example example_device_partition_flagged.cu
+ * \example example_device_partition_if.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/device_radix_sort.cuh b/external/cub/cub/device/device_radix_sort.cuh
new file mode 100644
index 00000000000..fe6cad65d7b
--- /dev/null
+++ b/external/cub/cub/device/device_radix_sort.cuh
@@ -0,0 +1,796 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_radix_sort.cuh"
+#include "../util_arch.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
+ * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ * characters, etc.) specified from least-significant to most-significant.  For a
+ * given input sequence of keys and a set of rules specifying a total ordering
+ * of the symbolic alphabet, the radix sorting method produces a lexicographic
+ * ordering of those keys.
+ *
+ * \par
+ * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
+ * <tt>unsigned char</tt>, \p int, \p double, etc.  Although the direct radix sorting
+ * method can only be applied to unsigned integral types, DeviceRadixSort
+ * is able to sort signed and floating-point types via simple bit-wise transformations
+ * that ensure lexicographic key ordering.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceRadixSort}
+ *
+ * \par Performance
+ * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys
+ * performance across different CUDA architectures for uniform-random \p uint32 keys.
+ * \plots_below
+ *
+ * \image html lsb_radix_sort_int32_keys.png
+ *
+ */
+struct DeviceRadixSort
+{
+
+    /******************************************************************//**
+     * \name Key-value pairs
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Sorts key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
+     * <tt>uint64,uint64</tt> pairs, respectively.
+     *
+     * \image html lsb_radix_sort_int32_pairs.png
+     * \image html lsb_radix_sort_int64_pairs.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [        ...        ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
+     * // d_values_out          <-- [5, 4, 3, 1, 2, 0, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> Key type
+     * \tparam ValueT    <b>[inferred]</b> Value type
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input sequence of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
+        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+        // Pair each input pointer with its output pointer in a DoubleBuffer; const is cast away so the inputs can be passed as KeyT* / ValueT*
+        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
+        // NOTE(review): the leading `false` template argument presumably selects ascending order -- confirm in dispatch_radix_sort.cuh
+        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            begin_bit,
+            end_bit,
+            false,              // NOTE(review): presumably "input may be overwritten" = false (the DoubleBuffer overload passes true) -- confirm
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
+     * <tt>uint64,uint64</tt> pairs, respectively.
+     *
+     * \image html lsb_radix_sort_int32_pairs.png
+     * \image html lsb_radix_sort_int64_pairs.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
+     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> Key type
+     * \tparam ValueT    <b>[inferred]</b> Value type
+     */
+    template <typename KeyT, typename ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] Number of items to sort
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Dispatcher specialization: <false> is the flag the *Descending overloads set
+        // to true, and plain int serves as the signed global-offset type.
+        typedef DispatchRadixSort<false, KeyT, ValueT, int> DispatchT;
+
+        // true is passed where the pointer-based overloads (which promise not to alter
+        // their input) pass false -- presumably an "overwrite okay" flag; confirm
+        // against DispatchRadixSort::Dispatch.
+        const bool overwrite_flag = true;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_keys, d_values,
+            num_items,
+            begin_bit, end_bit,
+            overwrite_flag,
+            stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortPairs.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [        ...        ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
+     *
+     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]
+     * // d_values_out          <-- [6, 0, 2, 1, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     * \tparam ValueT    <b>[inferred]</b> ValueT type
+     */
+    template <typename KeyT, typename ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
+        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Dispatcher specialization: <true> is the flag that sets the descending
+        // overloads apart (ascending ones pass false); int is the global-offset type.
+        typedef DispatchRadixSort<true, KeyT, ValueT, int> DispatchT;
+
+        // Wrap each input/output pointer pair in a DoubleBuffer for the dispatcher
+        KeyT    *keys_src   = const_cast<KeyT*>(d_keys_in);
+        ValueT  *values_src = const_cast<ValueT*>(d_values_in);
+        DoubleBuffer<KeyT>      key_buffers(keys_src, d_keys_out);
+        DoubleBuffer<ValueT>    value_buffers(values_src, d_values_out);
+
+        // false here (the DoubleBuffer overloads pass true) goes with the documented
+        // promise that the input data are not altered.
+        const bool overwrite_flag = false;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            key_buffers, value_buffers,
+            num_items, begin_bit, end_bit,
+            overwrite_flag, stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortPairs.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+     *
+     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
+     * // d_values.Current()    <-- [6, 0, 2, 1, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     * \tparam ValueT    <b>[inferred]</b> ValueT type
+     */
+    template <typename KeyT, typename ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] Number of items to sort
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Dispatcher specialization: <true> is the flag that sets the descending
+        // overloads apart (ascending ones pass false); int is the global-offset type.
+        typedef DispatchRadixSort<true, KeyT, ValueT, int> DispatchT;
+
+        // Both buffers of each pair may be altered by this overload, so true is passed
+        // where the pointer-based overloads pass false (presumably an "overwrite okay"
+        // flag; confirm against DispatchRadixSort::Dispatch).
+        const bool overwrite_flag = true;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_keys, d_values,
+            num_items,
+            begin_bit, end_bit,
+            overwrite_flag,
+            stream, debug_synchronous);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Keys-only
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sorts keys into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
+     *
+     * \image html lsb_radix_sort_int32_keys.png
+     * \image html lsb_radix_sort_int64_keys.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Keys-only dispatcher: NullType stands in for the value type, <false> is the
+        // flag the *Descending overloads set to true, and int is the offset type.
+        typedef DispatchRadixSort<false, KeyT, NullType, int> DispatchT;
+
+        // Pair the input and output pointers in a DoubleBuffer for the dispatcher
+        KeyT *keys_src = const_cast<KeyT*>(d_keys_in);
+        DoubleBuffer<KeyT>      key_buffers(keys_src, d_keys_out);
+        DoubleBuffer<NullType>  no_values;     // default-constructed placeholder
+
+        // false here (the DoubleBuffer overloads pass true) goes with the documented
+        // promise that the input data are not altered.
+        const bool overwrite_flag = false;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            key_buffers, no_values,
+            num_items, begin_bit, end_bit,
+            overwrite_flag, stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts keys into ascending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sorting performance across different
+     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
+     *
+     * \image html lsb_radix_sort_int32_keys.png
+     * \image html lsb_radix_sort_int64_keys.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Keys-only dispatcher: NullType stands in for the value type, <false> is the
+        // flag the *Descending overloads set to true, and int is the offset type.
+        typedef DispatchRadixSort<false, KeyT, NullType, int> DispatchT;
+
+        // Default-constructed placeholder for the absent values
+        DoubleBuffer<NullType> no_values;
+
+        // Both key buffers may be altered by this overload, so true is passed where
+        // the pointer-based overloads (which leave their input intact) pass false.
+        const bool overwrite_flag = true;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_keys, no_values,
+            num_items,
+            begin_bit, end_bit,
+            overwrite_flag, stream, debug_synchronous);
+    }
+
+    /**
+     * \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortKeys.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [        ...        ]
+     * ...
+     *
+     * // Note: this overload takes separate input and output pointers, so no
+     * // DoubleBuffer wrapper is needed (see the DoubleBuffer overload below)
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+     *
+     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
+        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Keys-only dispatcher: NullType stands in for the value type, <true> is the
+        // flag the ascending overloads leave false, and int is the offset type.
+        typedef DispatchRadixSort<true, KeyT, NullType, int> DispatchT;
+
+        // Pair the input and output pointers in a DoubleBuffer for the dispatcher
+        DoubleBuffer<KeyT>      key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<NullType>  no_values;     // default-constructed placeholder
+
+        // false here (the DoubleBuffer overloads pass true) goes with the documented
+        // promise that the input data are not altered.
+        const bool overwrite_flag = false;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            key_buffers, no_values,
+            num_items, begin_bit, end_bit,
+            overwrite_flag, stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts keys into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Performance
+     * Performance is similar to DeviceRadixSort::SortKeys.
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sorting of a device vector of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [        ...        ]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
+     *
+     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam KeyT      <b>[inferred]</b> KeyT type
+     */
+    template <typename KeyT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] Number of items to sort
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Keys-only dispatcher: NullType stands in for the value type, <true> is the
+        // flag the ascending overloads leave false, and int is the offset type.
+        typedef DispatchRadixSort<true, KeyT, NullType, int> DispatchT;
+
+        // Default-constructed placeholder for the absent values
+        DoubleBuffer<NullType> no_values;
+
+        // Both key buffers may be altered by this overload, so true is passed where
+        // the pointer-based overloads (which leave their input intact) pass false.
+        const bool overwrite_flag = true;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_keys, no_values,
+            num_items,
+            begin_bit, end_bit,
+            overwrite_flag, stream, debug_synchronous);
+    }
+
+
+    //@}  end member group
+
+
+};
+
+/**
+ * \example example_device_radix_sort.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/device_reduce.cuh b/external/cub/cub/device/device_reduce.cuh
new file mode 100644
index 00000000000..3939a7ee7bf
--- /dev/null
+++ b/external/cub/cub/device/device_reduce.cuh
@@ -0,0 +1,734 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "../iterator/arg_index_input_iterator.cuh"
+#include "dispatch/dispatch_reduce.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceReduce}
+ *
+ * \par Performance
+ * \linear_performance{reduction, reduce-by-key, and run-length encode}
+ *
+ * \par
+ * The following chart illustrates DeviceReduce::Sum
+ * performance across different CUDA architectures for \p int32 keys.
+ *
+ * \image html reduce_int32.png
+ *
+ * \par
+ * The following chart illustrates DeviceReduce::ReduceByKey (summation)
+ * performance across different CUDA architectures for \p fp32
+ * values.  Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
+ *
+ * \image html reduce_by_key_fp32_len_500.png
+ *
+ * \par
+ * \plots_below
+ *
+ */
+struct DeviceReduce
+{
+    /**
+     * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init.
+     *
+     * \par
+     * - Does not support binary reduction operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     __device__ __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;  // e.g., 7
+     * int          *d_in;      // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;     // e.g., [-]
+     * CustomMin    min_op;
+     * int          init;       // e.g., INT_MAX
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduction
+     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
+     *
+     * // d_out <-- [0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam ReductionOpT         <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
+     * \tparam T                    <b>[inferred]</b> Data element type that is convertible to the \p value type of \p InputIteratorT
+     */
+    template <
+        typename InputIteratorT,
+        typename OutputIteratorT,
+        typename ReductionOpT,
+        typename T>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Reduce(
+        void            *d_temp_storage,                ///< [in] %Device-accessible temporary storage allocation.  If NULL, only the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,            ///< [in,out] Size in bytes of the \p d_temp_storage allocation (passed by reference)
+        InputIteratorT  d_in,                           ///< [in] Iterator to the input sequence of data items
+        OutputIteratorT d_out,                          ///< [out] Iterator to the output aggregate
+        int             num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        ReductionOpT    reduction_op,                   ///< [in] Binary reduction functor
+        T               init,                           ///< [in] Initial value of the reduction
+        cudaStream_t    stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous   = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Global offsets use a signed integer type
+        typedef int OffsetT;
+
+        // Dispatcher specialized for the caller's iterator and operator types
+        typedef DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT> DispatchT;
+
+        // Hand every argument through unchanged and return the dispatcher's status
+        return DispatchT::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in, d_out, num_items,
+            reduction_op, init,
+            stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide sum using the addition (\p +) operator.
+     *
+     * \par
+     * - Uses \p 0 as the initial value of the reduction.
+     * - Does not support \p + operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated sum-reduction performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.
+     *
+     * \image html reduce_int32.png
+     * \image html reduce_int64.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sum-reduction
+     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [38]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     */
+    template <
+        typename InputIteratorT,
+        typename OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Sum(
+        void            *d_temp_storage,                ///< [in] %Device-accessible temporary storage allocation.  If NULL, only the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,            ///< [in,out] Size in bytes of the \p d_temp_storage allocation (passed by reference)
+        InputIteratorT  d_in,                           ///< [in] Iterator to the input sequence of data items
+        OutputIteratorT d_out,                          ///< [out] Iterator to the output aggregate
+        int             num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t    stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous   = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Global offsets use a signed integer type
+        typedef int OffsetT;
+
+        // Value types reported by the two iterators
+        typedef typename std::iterator_traits<InputIteratorT>::value_type  InputValueT;
+        typedef typename std::iterator_traits<OutputIteratorT>::value_type OutputValueT;
+
+        // Type of the initial value: the output value type, unless that is void, in which case the input value type
+        typedef typename If<(Equals<OutputValueT, void>::VALUE), InputValueT, OutputValueT>::Type OutputT;
+
+        typedef DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum> DispatchT;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_in, d_out, num_items,
+            cub::Sum(),
+            OutputT(),            // value-initialized starting value
+            stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide minimum using the less-than ('<') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction.
+     * - Does not support \p < operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run min-reduction
+     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     */
+    template <
+        typename InputIteratorT,
+        typename OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Min(
+        void            *d_temp_storage,                ///< [in] %Device-accessible temporary storage allocation.  If NULL, only the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,            ///< [in,out] Size in bytes of the \p d_temp_storage allocation (passed by reference)
+        InputIteratorT  d_in,                           ///< [in] Iterator to the input sequence of data items
+        OutputIteratorT d_out,                          ///< [out] Iterator to the output aggregate
+        int             num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t    stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous   = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Global offsets use a signed integer type
+        typedef int OffsetT;
+
+        // Value type of the input sequence
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        // Dispatcher specialized for the cub::Min operator
+        typedef DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min> DispatchT;
+
+        // Initial value is Traits<InputT>::Max(); replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_in, d_out, num_items,
+            cub::Min(),
+            Traits<InputT>::Max(),
+            stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The minimum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
+     * - Does not support \p < operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_items;      // e.g., 7
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmin-reduction
+     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [{5, 0}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>cub::KeyValuePair<int, T></tt>) \iterator
+     */
+    template <
+        typename InputIteratorT,
+        typename OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMin(
+        void            *d_temp_storage,                ///< [in] %Device-accessible temporary storage allocation.  If NULL, only the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,            ///< [in,out] Size in bytes of the \p d_temp_storage allocation (passed by reference)
+        InputIteratorT  d_in,                           ///< [in] Iterator to the input sequence of data items
+        OutputIteratorT d_out,                          ///< [out] Iterator to the output aggregate
+        int             num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t    stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous   = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Global offsets use a signed integer type
+        typedef int OffsetT;
+
+        // Value types reported by the two iterators
+        typedef typename std::iterator_traits<InputIteratorT>::value_type  InputValueT;
+        typedef typename std::iterator_traits<OutputIteratorT>::value_type OutputIterValueT;
+
+        // Output tuple type: the output iterator's value type, unless that is
+        // void, in which case KeyValuePair<OffsetT, InputValueT>
+        typedef typename If<(Equals<OutputIterValueT, void>::VALUE),
+            KeyValuePair<OffsetT, InputValueT>,
+            OutputIterValueT>::Type OutputTupleT;
+
+        // Value member type of the output tuple
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrap d_in so that items are read as index-value tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial tuple {1, Traits<InputValueT>::Max()}; replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+        OutputTupleT initial_value(1, Traits<InputValueT>::Max());
+
+        // Dispatcher specialized for the cub::ArgMin operator
+        typedef DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin> DispatchT;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_indexed_in, d_out, num_items,
+            cub::ArgMin(), initial_value,
+            stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide maximum using the greater-than ('>') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
+     * - Does not support \p > operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run max-reduction
+     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [9]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     */
+    template <
+        typename InputIteratorT,
+        typename OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Max(
+        void            *d_temp_storage,                ///< [in] %Device-accessible temporary storage allocation.  If NULL, only the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,            ///< [in,out] Size in bytes of the \p d_temp_storage allocation (passed by reference)
+        InputIteratorT  d_in,                           ///< [in] Iterator to the input sequence of data items
+        OutputIteratorT d_out,                          ///< [out] Iterator to the output aggregate
+        int             num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t    stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous   = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Global offsets use a signed integer type
+        typedef int OffsetT;
+
+        // Value type of the input sequence
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        // Dispatcher specialized for the cub::Max operator
+        typedef DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max> DispatchT;
+
+        // Initial value is Traits<InputT>::Lowest(); replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_in, d_out, num_items,
+            cub::Max(),
+            Traits<InputT>::Lowest(),
+            stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The maximum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
+     * - Does not support \p > operators that are non-commutative.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_items;      // e.g., 7
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmax-reduction
+     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [{6, 9}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>cub::KeyValuePair<int, T></tt>) \iterator
+     */
+    template <
+        typename InputIteratorT,
+        typename OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMax(
+        void            *d_temp_storage,                ///< [in] %Device-accessible temporary storage allocation.  If NULL, only the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,            ///< [in,out] Size in bytes of the \p d_temp_storage allocation (passed by reference)
+        InputIteratorT  d_in,                           ///< [in] Iterator to the input sequence of data items
+        OutputIteratorT d_out,                          ///< [out] Iterator to the output aggregate
+        int             num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t    stream              = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous   = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Global offsets use a signed integer type
+        typedef int OffsetT;
+
+        // Value types reported by the two iterators
+        typedef typename std::iterator_traits<InputIteratorT>::value_type  InputValueT;
+        typedef typename std::iterator_traits<OutputIteratorT>::value_type OutputIterValueT;
+
+        // Output tuple type: the output iterator's value type, unless that is
+        // void, in which case KeyValuePair<OffsetT, InputValueT>
+        typedef typename If<(Equals<OutputIterValueT, void>::VALUE),
+            KeyValuePair<OffsetT, InputValueT>,
+            OutputIterValueT>::Type OutputTupleT;
+
+        // Value member type of the output tuple
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrap d_in so that items are read as index-value tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial tuple {1, Traits<InputValueT>::Lowest()}; replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());
+
+        // Dispatcher specialized for the cub::ArgMax operator
+        typedef DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax> DispatchT;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_indexed_in, d_out, num_items,
+            cub::ArgMax(), initial_value,
+            stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
+     *
+     * \par
+     * This operation computes segmented reductions within \p d_values_in using
+     * the specified binary \p reduction_op functor.  The segments are identified by
+     * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of
+     * consecutive, identical keys.  For the <em>i</em><sup>th</sup> run encountered,
+     * the first key of the run and the corresponding value aggregate of that run are
+     * written to <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_aggregates_out[<em>i</em>]</tt>,
+     * respectively. The total number of runs encountered is written to \p d_num_runs_out.
+     *
+     * \par
+     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following chart illustrates reduction-by-key (sum) performance across
+     * different CUDA architectures for \p fp32 and \p fp64 values, respectively.  Segments
+     * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
+     *
+     * \image html reduce_by_key_fp32_len_500.png
+     * \image html reduce_by_key_fp64_len_500.png
+     *
+     * \par
+     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
+     *
+     * \image html reduce_by_key_fp32_len_5.png
+     * \image html reduce_by_key_fp64_len_5.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the segmented reduction of \p int values grouped
+     * by runs of associated \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;          // e.g., 8
+     * int          *d_keys_in;         // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int          *d_values_in;       // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
+     * int          *d_unique_out;      // e.g., [-, -, -, -, -, -, -, -]
+     * int          *d_aggregates_out;  // e.g., [-, -, -, -, -, -, -, -]
+     * int          *d_num_runs_out;    // e.g., [-]
+     * CustomMin    reduction_op;
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduce-by-key
+     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
+     *
+     * // d_unique_out      <-- [0, 2, 9, 5, 8]
+     * // d_aggregates_out  <-- [0, 1, 6, 2, 4]
+     * // d_num_runs_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam KeysInputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input keys \iterator
+     * \tparam UniqueOutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing unique output keys \iterator
+     * \tparam ValuesInputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input values \iterator
+     * \tparam AggregatesOutputIteratorT <b>[inferred]</b> Random-access output iterator type for writing output value aggregates \iterator
+     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
+     * \tparam ReductionOpT              <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
+     */
+    template <
+        typename                    KeysInputIteratorT,
+        typename                    UniqueOutputIteratorT,
+        typename                    ValuesInputIteratorT,
+        typename                    AggregatesOutputIteratorT,
+        typename                    NumRunsOutputIteratorT,
+        typename                    ReductionOpT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t ReduceByKey(
+        void                        *d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
+        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
+        ReductionOpT                reduction_op,                   ///< [in] Binary reduction functor
+        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_keys_in and \p d_values_in)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets (same type as the num_items parameter)
+        typedef int OffsetT;
+
+        // FlagT iterator type (not used; no typedef is declared for it in this function)
+
+        // Selection op (not used; no typedef is declared for it in this function)
+
+        // Key comparison functor: the default == operator decides whether adjacent keys belong to the same run
+        typedef Equality EqualityOp;
+
+        return DispatchReduceByKey<KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOpT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_unique_out,
+            d_values_in,
+            d_aggregates_out,
+            d_num_runs_out,
+            EqualityOp(),           // default-constructed key-equality functor
+            reduction_op,           // caller-supplied value reduction functor, forwarded unchanged
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+/**
+ * \example example_device_reduce.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/device_run_length_encode.cuh b/external/cub/cub/device/device_run_length_encode.cuh
new file mode 100644
index 00000000000..ed0bf9c7d67
--- /dev/null
+++ b/external/cub/cub/device/device_run_length_encode.cuh
@@ -0,0 +1,278 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_rle.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. ![](run_length_encode_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Run-length_encoding"><em>run-length encoding</em></a>
+ * computes a simple compressed representation of a sequence of input elements such that each
+ * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a
+ * count of the elements in that run.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceRunLengthEncode}
+ *
+ * \par Performance
+ * \linear_performance{run-length encode}
+ *
+ * \par
+ * The following chart illustrates DeviceRunLengthEncode::Encode performance across
+ * different CUDA architectures for \p int32 items.
+ * Segments have lengths uniformly sampled from [1,1000].
+ *
+ * \image html rle_int32_len_500.png
+ *
+ * \par
+ * \plots_below
+ *
+ */
+struct DeviceRunLengthEncode
+{
+
+    /**
+     * \brief Computes a run-length encoding of the sequence \p d_in.
+     *
+     * \par
+     * - For the <em>i</em><sup>th</sup> run encountered, the first key of the run and its length are written to
+     *   <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_counts_out[<em>i</em>]</tt>,
+     *   respectively.
+     * - The total number of runs encountered is written to \p d_num_runs_out.
+     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated encode performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
+     * lengths uniformly sampled from [1,1000].
+     *
+     * \image html rle_int32_len_500.png
+     * \image html rle_int64_len_500.png
+     *
+     * \par
+     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
+     *
+     * \image html rle_int32_len_5.png
+     * \image html rle_int64_len_5.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the run-length encoding of a sequence of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;          // e.g., 8
+     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int          *d_unique_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_counts_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_num_runs_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run encoding
+     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
+     *
+     * // d_unique_out      <-- [0, 2, 9, 5, 8]
+     * // d_counts_out      <-- [1, 2, 1, 3, 1]
+     * // d_num_runs_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam UniqueOutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing unique output items \iterator
+     * \tparam LengthsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing output counts \iterator
+     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    UniqueOutputIteratorT,
+        typename                    LengthsOutputIteratorT,
+        typename                    NumRunsOutputIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Encode(
+        void*                       d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        LengthsOutputIteratorT      d_counts_out,                   ///< [out] Pointer to the output sequence of run-lengths (one count per run)
+        NumRunsOutputIteratorT      d_num_runs_out,                     ///< [out] Pointer to total number of runs
+        int                         num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int         OffsetT;                    // Signed integer type for global offsets
+        typedef NullType*   FlagIterator;               // FlagT iterator type (declared but not referenced in this function)
+        typedef NullType    SelectOp;                   // Selection op (declared but not referenced in this function)
+        typedef Equality    EqualityOp;                 // Default == operator
+        typedef cub::Sum    ReductionOp;                // Value reduction operator
+
+        // The lengths output value type
+        typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+            OffsetT,                                                                                                    // ... then the OffsetT type,
+            typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+        // Generator type for providing 1s values for run-length reduction
+        typedef ConstantInputIterator<LengthT, OffsetT> LengthsInputIteratorT;
+
+        // Run-length encoding is expressed as a reduce-by-key: d_in supplies the keys and the 1s generator supplies the values
+        return DispatchReduceByKey<InputIteratorT, UniqueOutputIteratorT, LengthsInputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOp, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_unique_out,
+            LengthsInputIteratorT((LengthT) 1),     // values input: the 1s generator declared above
+            d_counts_out,
+            d_num_runs_out,
+            EqualityOp(),
+            ReductionOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in.
+     *
+     * \par
+     * - For the <em>i</em><sup>th</sup> non-trivial run, the run's starting offset
+     *   and its length are written to <tt>d_offsets_out[<em>i</em>]</tt> and
+     *   <tt>d_lengths_out[<em>i</em>]</tt>, respectively.
+     * - The total number of non-trivial runs encountered is written to \p d_num_runs_out.
+     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
+     * - \devicestorage
+     *
+     * \par Performance
+     *
+     * \par Snippet
+     * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;          // e.g., 8
+     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int          *d_offsets_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_lengths_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int          *d_num_runs_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run encoding
+     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
+     *
+     * // d_offsets_out         <-- [1, 4]
+     * // d_lengths_out         <-- [2, 3]
+     * // d_num_runs_out        <-- [2]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OffsetsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing run-offset values \iterator
+     * \tparam LengthsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing run-length values \iterator
+     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
+     */
+    template <
+        typename                InputIteratorT,
+        typename                OffsetsOutputIteratorT,
+        typename                LengthsOutputIteratorT,
+        typename                NumRunsOutputIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t NonTrivialRuns(
+        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT          d_in,                           ///< [in] Pointer to input sequence of data items
+        OffsetsOutputIteratorT  d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run)
+        LengthsOutputIteratorT  d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run)
+        NumRunsOutputIteratorT  d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
+        int                     num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t            stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int         OffsetT;                    // Signed integer type for global offsets
+        typedef Equality    EqualityOp;                 // Default == operator
+
+        return DeviceRleDispatch<InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_offsets_out,
+            d_lengths_out,
+            d_num_runs_out,
+            EqualityOp(),           // default-constructed item-equality functor
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/device_scan.cuh b/external/cub/cub/device/device_scan.cuh
new file mode 100644
index 00000000000..4589279eeb6
--- /dev/null
+++ b/external/cub/cub/device/device_scan.cuh
@@ -0,0 +1,443 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_scan.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * Given a sequence of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
+ * produces an output sequence where each element is computed to be the reduction
+ * of the elements occurring earlier in the input sequence.  <em>Prefix sum</em>
+ * connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ * that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
+ * The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
+ * the <em>i</em><sup>th</sup> output reduction.
+ *
+ * \par
+ * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our <em>"decoupled look-back"</em> algorithm
+ * for performing global prefix scan with only a single pass through the
+ * input data, as described in our 2016 technical report [1].  The central
+ * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies
+ * of global prefix propagation with local computation.  As such, our algorithm requires only
+ * ~2<em>n</em> data movement (<em>n</em> inputs are read, <em>n</em> outputs are written), and typically
+ * proceeds at "memcpy" speeds.
+ *
+ * \par
+ * [1] [Duane Merrill and Michael Garland.  "Single-pass Parallel Prefix Scan with Decoupled Look-back", <em>NVIDIA Technical Report NVR-2016-002</em>, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back)
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceScan}
+ *
+ * \par Performance
+ * \linear_performance{prefix scan}
+ *
+ * \par
+ * The following chart illustrates DeviceScan::ExclusiveSum
+ * performance across different CUDA architectures for \p int32 keys.
+ * \plots_below
+ *
+ * \image html scan_int32.png
+ *
+ */
+struct DeviceScan
+{
+    /******************************************************************//**
+     * \name Exclusive scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a device-wide exclusive prefix sum.  The value of 0 is applied as the initial value, and is assigned to *d_out.
+     *
+     * \par
+     * - Supports non-commutative sum operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated exclusive sum performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.
+     *
+     * \image html scan_int32.png
+     * \image html scan_int64.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the exclusive prefix sum of an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run exclusive prefix sum
+     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [0, 8, 14, 21, 26, 29, 29]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     */
+    template <
+        typename        InputIteratorT,
+        typename        OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ExclusiveSum(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
+        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // Signed integer type for global offsets (same type as the num_items parameter)
+        typedef int OffsetT;
+
+        // The output value type (also the type of the initial value handed to the dispatch layer)
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+        // Initial value: the exclusive sum is seeded with 0 (OutputT must be copy-initializable from the literal 0)
+        OutputT init_value = 0;
+
+        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            Sum(),                  // default-constructed Sum functor passed as the scan operator
+            init_value,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.  The \p init_value value is applied as the initial value, and is assigned to *d_out.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;      // e.g., 7
+     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * CustomMin    min_op;
+     * ...
+     *
+     * // Determine temporary device storage requirements for exclusive prefix scan
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) INT_MAX, num_items);
+     *
+     * // Allocate temporary storage for exclusive prefix scan
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run exclusive prefix min-scan
+     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) INT_MAX, num_items);
+     *
+     * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT   <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT  <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     * \tparam ScanOpT          <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam InitValueT       <b>[inferred]</b> Type of the \p init_value used to seed the exclusive scan
+     */
+    template <
+        typename InputIteratorT,
+        typename OutputIteratorT,
+        typename ScanOpT,
+        typename InitValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ExclusiveScan(
+        void*           d_temp_storage,                     ///< [in] %Device-accessible temporary storage.  Pass NULL to have the required allocation size written to \p temp_storage_bytes without any work being done.
+        size_t&         temp_storage_bytes,                 ///< [in,out] Size in bytes of the \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Iterator to the first item of the input sequence
+        OutputIteratorT d_out,                              ///< [out] Iterator to the first item of the output sequence
+        ScanOpT         scan_op,                            ///< [in] Binary functor used to combine scan items
+        InitValueT      init_value,                         ///< [in] Seed of the exclusive scan; it is also the value assigned to <tt>*d_out</tt>
+        int             num_items,                          ///< [in] Number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream in which kernels are launched.  Defaults to stream<sub>0</sub>.
+        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> When \p true, the stream is synchronized after every kernel launch to check for errors (may be significantly slower).  Defaults to \p false.
+    {
+        // Global offsets are carried in a signed integer type
+        typedef int GlobalOffsetT;
+
+        // Scan dispatcher specialized for the caller's iterator, operator and initial-value types
+        typedef DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, GlobalOffsetT> ScanDispatchT;
+
+        // Forward every argument unchanged and hand the dispatcher's status back to the caller
+        const cudaError_t status = ScanDispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_in, d_out,
+            scan_op, init_value, num_items,
+            stream, debug_synchronous);
+        return status;
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive scans
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a device-wide inclusive prefix sum.
+     *
+     * \par
+     * - Supports non-commutative sum operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the inclusive prefix sum of an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;      // e.g., 7
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * ...
+     *
+     * // Determine temporary device storage requirements for inclusive prefix sum
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // Allocate temporary storage for inclusive prefix sum
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run inclusive prefix sum
+     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+     *
+     * // d_out <-- [8, 14, 21, 26, 29, 29, 38]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     */
+    template <
+        typename InputIteratorT,
+        typename OutputIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t InclusiveSum(
+        void                *d_temp_storage,                ///< [in] %Device-accessible temporary storage.  Pass NULL to have the required allocation size written to \p temp_storage_bytes without any work being done.
+        size_t              &temp_storage_bytes,            ///< [in,out] Size in bytes of the \p d_temp_storage allocation
+        InputIteratorT      d_in,                           ///< [in] Iterator to the first item of the input sequence
+        OutputIteratorT     d_out,                          ///< [out] Iterator to the first item of the output sequence
+        int                 num_items,                      ///< [in] Number of input items (i.e., the length of \p d_in)
+        cudaStream_t        stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream in which kernels are launched.  Defaults to stream<sub>0</sub>.
+        bool                debug_synchronous  = false)     ///< [in] <b>[optional]</b> When \p true, the stream is synchronized after every kernel launch to check for errors (may be significantly slower).  Defaults to \p false.
+    {
+        // Global offsets are carried in a signed integer type
+        typedef int GlobalOffsetT;
+
+        // Scan dispatcher specialized for addition; NullType fills the initial-value type slot
+        typedef DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, GlobalOffsetT> SumDispatchT;
+
+        // Sum() is the scan operator; NullType() is passed where the exclusive variants pass an initial value
+        const cudaError_t status = SumDispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_in, d_out,
+            Sum(), NullType(), num_items,
+            stream, debug_synchronous);
+        return status;
+    }
+
+
+    /**
+     * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - Provides "run-to-run" determinism for pseudo-associative reduction
+     *   (e.g., addition of floating point types) on the same GPU device.
+     *   However, results for pseudo-associative reduction may be inconsistent
+     *   from one device to another device of a different compute-capability
+     *   because CUB can employ different tile-sizing for different architectures.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_items;      // e.g., 7
+     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
+     * CustomMin    min_op;
+     * ...
+     *
+     * // Determine temporary device storage requirements for inclusive prefix scan
+     * void *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
+     *
+     * // Allocate temporary storage for inclusive prefix scan
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run inclusive prefix min-scan
+     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
+     *
+     * // d_out <-- [8, 6, 6, 5, 3, 0, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT   <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
+     * \tparam OutputIteratorT  <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
+     * \tparam ScanOpT          <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename InputIteratorT,
+        typename OutputIteratorT,
+        typename ScanOpT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t InclusiveScan(
+        void*           d_temp_storage,                     ///< [in] %Device-accessible temporary storage.  Pass NULL to have the required allocation size written to \p temp_storage_bytes without any work being done.
+        size_t&         temp_storage_bytes,                 ///< [in,out] Size in bytes of the \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Iterator to the first item of the input sequence
+        OutputIteratorT d_out,                              ///< [out] Iterator to the first item of the output sequence
+        ScanOpT         scan_op,                            ///< [in] Binary functor used to combine scan items
+        int             num_items,                          ///< [in] Number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream             = 0,             ///< [in] <b>[optional]</b> CUDA stream in which kernels are launched.  Defaults to stream<sub>0</sub>.
+        bool            debug_synchronous  = false)         ///< [in] <b>[optional]</b> When \p true, the stream is synchronized after every kernel launch to check for errors (may be significantly slower).  Defaults to \p false.
+    {
+        // Global offsets are carried in a signed integer type
+        typedef int GlobalOffsetT;
+
+        // Scan dispatcher specialized for the caller's operator; NullType fills the initial-value type slot
+        typedef DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, GlobalOffsetT> ScanDispatchT;
+
+        // NullType() is passed where the exclusive variants pass an initial value
+        const cudaError_t status = ScanDispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_in, d_out,
+            scan_op, NullType(), num_items,
+            stream, debug_synchronous);
+        return status;
+    }
+
+    //@}  end member group
+
+};
+
+/**
+ * \example example_device_scan.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/device_segmented_radix_sort.cuh b/external/cub/cub/device/device_segmented_radix_sort.cuh
new file mode 100644
index 00000000000..7f8bf8e7b3c
--- /dev/null
+++ b/external/cub/cub/device/device_segmented_radix_sort.cuh
@@ -0,0 +1,875 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_radix_sort.cuh"
+#include "../util_arch.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png)
+ * \ingroup SegmentedModule
+ *
+ * \par Overview
+ * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
+ * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ * characters, etc.) specified from least-significant to most-significant.  For a
+ * given input sequence of keys and a set of rules specifying a total ordering
+ * of the symbolic alphabet, the radix sorting method produces a lexicographic
+ * ordering of those keys.
+ *
+ * \par
+ * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
+ * <tt>unsigned char</tt>, \p int, \p double, etc.  Although the direct radix sorting
+ * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort
+ * is able to sort signed and floating-point types via simple bit-wise transformations
+ * that ensure lexicographic key ordering.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSegmentedRadixSort}
+ *
+ */
+struct DeviceSegmentedRadixSort
+{
+
+    /******************************************************************//**
+     * \name Key-value pairs
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Sorts segments of key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
+     * // d_values_out          <-- [1, 2, 0, 5, 4, 3, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename KeyT,
+        typename ValueT,
+        typename OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void*               d_temp_storage,                         ///< [in] %Device-accessible temporary storage.  Pass NULL to have the required allocation size written to \p temp_storage_bytes without any work being done.
+        size_t&             temp_storage_bytes,                     ///< [in,out] Size in bytes of the \p d_temp_storage allocation
+        const KeyT*         d_keys_in,                              ///< [in] %Device-accessible pointer to the unsorted input keys
+        KeyT*               d_keys_out,                             ///< [out] %Device-accessible pointer that receives the sorted keys
+        const ValueT*       d_values_in,                            ///< [in] %Device-accessible pointer to the values associated with the input keys
+        ValueT*             d_values_out,                           ///< [out] %Device-accessible pointer that receives the values, reordered to match the sorted keys
+        int                 num_items,                              ///< [in] Total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] Number of segments that make up the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Sequence of \p num_segments beginning offsets: <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Sequence of \p num_segments ending offsets: <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> Least-significant bit index (inclusive) needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> Most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream in which kernels are launched.  Defaults to stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> When \p true, the stream is synchronized after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Defaults to \p false.
+    {
+        // Global offsets are carried in a signed integer type
+        typedef int GlobalOffsetT;
+
+        // NOTE(review): the leading 'false' is presumably the descending-order switch (this overload sorts ascending) -- confirm
+        typedef DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, GlobalOffsetT> SegmentedSortDispatchT;
+
+        // Pair each input pointer with its output pointer; const is cast away only so the input fits DoubleBuffer
+        DoubleBuffer<KeyT>   keys_buffer(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<ValueT> values_buffer(const_cast<ValueT*>(d_values_in), d_values_out);
+
+        // Forward to the segmented radix-sort dispatcher and hand its status back to the caller
+        const cudaError_t status = SegmentedSortDispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            keys_buffer, values_buffer,
+            num_items, num_segments,
+            d_begin_offsets, d_end_offsets,
+            begin_bit, end_bit,
+            false,      // NOTE(review): presumably "input may be overwritten"; false fits the unaltered-input contract -- confirm
+            stream, debug_synchronous);
+        return status;
+    }
+
+
+    /**
+     * \brief Sorts segments of key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
+     * // d_values.Current()    <-- [1, 2, 0, 5, 4, 3, 6]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename KeyT,
+        typename ValueT,
+        typename OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairs(
+        void*                   d_temp_storage,                         ///< [in] %Device-accessible temporary storage.  Pass NULL to have the required allocation size written to \p temp_storage_bytes without any work being done.
+        size_t&                 temp_storage_bytes,                     ///< [in,out] Size in bytes of the \p d_temp_storage allocation
+        DoubleBuffer<KeyT>&     d_keys,                                 ///< [in,out] Double-buffer of keys: its "current" device-accessible buffer holds the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>&   d_values,                               ///< [in,out] Double-buffer of values: its "current" device-accessible buffer holds the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] Total number of items to sort (across all segments)
+        int                     num_segments,                           ///< [in] Number of segments that make up the sorting data
+        OffsetIteratorT         d_begin_offsets,                        ///< [in] Sequence of \p num_segments beginning offsets: <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in the key and value buffers
+        OffsetIteratorT         d_end_offsets,                          ///< [in] Sequence of \p num_segments ending offsets: <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in the key and value buffers.  If <tt>d_end_offsets[i]</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> Least-significant bit index (inclusive) needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> Most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream in which kernels are launched.  Defaults to stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> When \p true, the stream is synchronized after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Defaults to \p false.
+    {
+        // Global offsets are carried in a signed integer type
+        typedef int GlobalOffsetT;
+
+        // NOTE(review): the leading 'false' is presumably the descending-order switch (this overload sorts ascending) -- confirm
+        typedef DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, GlobalOffsetT> SegmentedSortDispatchT;
+
+        // The caller's double-buffers are forwarded by reference, so the dispatcher can update them in place.
+        // The dispatcher's status is handed back to the caller unchanged.
+        const cudaError_t status = SegmentedSortDispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_keys, d_values,
+            num_items, num_segments,
+            d_begin_offsets, d_end_offsets,
+            begin_bit, end_bit,
+            true,       // NOTE(review): presumably "input may be overwritten"; true fits the may-alter-both-buffers contract -- confirm
+            stream, debug_synchronous);
+        return status;
+    }
+
+
+    /**
+     * \brief Sorts segments of key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
+     * // d_values_out          <-- [0, 2, 1, 6, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            ValueT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible temporary storage allocation.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to the size in bytes of the \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input sequence of keys to sort
+        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of keys
+        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the input sequence of values associated with the keys
+        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the output sequence of values, reordered to correspond with the sorted keys
+        int                 num_items,                              ///< [in] Total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] Number of segments that make up the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> Least-significant bit index (inclusive) needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> Most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream in which kernels are launched.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Global offsets are signed ints; the leading template argument is true for the descending overloads
+        typedef int OffsetT;
+        typedef DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT> DispatchT;
+
+        // Pair each input array with its output array (const is cast away only to fit the wrapper type)
+        DoubleBuffer<KeyT>       key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        DoubleBuffer<ValueT>     value_buffers(const_cast<ValueT*>(d_values_in), d_values_out);
+
+        // The literal false sets this overload (inputs not altered) apart from the DoubleBuffer one, which passes true
+        return DispatchT::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            key_buffers,
+            value_buffers,
+            num_items, num_segments,
+            d_begin_offsets, d_end_offsets,
+            begin_bit, end_bit,
+            false,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers and a corresponding
+     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
+     *   structure that indicates which of the two buffers is "current" (and thus
+     *   contains the input data to be sorted).
+     * - The contents of both buffers within each pair may be altered by the sorting
+     *   operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within each DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+     * with associated vector of \p int values.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
+     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a set of DoubleBuffers to wrap pairs of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
+     * // d_values.Current()    <-- [0, 2, 1, 6, 3, 4, 5]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam ValueT           <b>[inferred]</b> Value type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename                KeyT,
+        typename                ValueT,
+        typename                OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortPairsDescending(
+        void                    *d_temp_storage,                        ///< [in] %Device-accessible temporary storage allocation.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to the size in bytes of the \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Double-buffer of keys.  Its "current" device-accessible buffer holds the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values.  Its "current" device-accessible buffer holds the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,                              ///< [in] Total number of items to sort (across all segments)
+        int                     num_segments,                           ///< [in] Number of segments that make up the sorting data
+        OffsetIteratorT         d_begin_offsets,                        ///< [in] Sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in the key and value buffers
+        OffsetIteratorT         d_end_offsets,                          ///< [in] Sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in the key and value buffers.  If <tt>d_end_offsets[i]</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> Least-significant bit index (inclusive) needed for key comparison
+        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> Most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream in which kernels are launched.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Global offsets are signed integers
+        typedef int OffsetT;
+
+        // The leading template argument is true for the descending overloads
+        typedef DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT> DispatchT;
+
+        // The caller's DoubleBuffers are forwarded as-is; the literal true is passed
+        // by the DoubleBuffer overloads, whose buffers may both be altered by the sort
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items, num_segments,
+            d_begin_offsets, d_end_offsets,
+            begin_bit, end_bit,
+            true,
+            stream, debug_synchronous);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Keys-only
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sorts segments of keys into ascending order. (~<em>2N </em>auxiliary storage required)
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible temporary storage allocation.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to the size in bytes of the \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input sequence of keys to sort
+        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of keys
+        int                 num_items,                              ///< [in] Total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] Number of segments that make up the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>.  If <tt>d_end_offsets[i]</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> Least-significant bit index (inclusive) needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> Most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream in which kernels are launched.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Global offsets are signed ints; the leading template argument is false for the ascending overloads
+        typedef int OffsetT;
+        typedef DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT> DispatchT;
+
+        // Pair the input key array with the output key array (const is cast away only to fit the wrapper type)
+        DoubleBuffer<KeyT>      key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);
+
+        // Keys-only sort: a default-constructed NullType buffer fills the value slot
+        DoubleBuffer<NullType>  no_values;
+
+        // The literal false sets this overload (input not altered) apart from the DoubleBuffer one, which passes true
+        return DispatchT::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            key_buffers,
+            no_values,
+            num_items, num_segments,
+            d_begin_offsets, d_end_offsets,
+            begin_bit, end_bit,
+            false,
+            stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of keys into ascending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeys(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible temporary storage allocation.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to the size in bytes of the \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Double-buffer of keys.  Its "current" device-accessible buffer holds the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] Total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] Number of segments that make up the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in the key buffers
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in the key buffers.  If <tt>d_end_offsets[i]</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> Least-significant bit index (inclusive) needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> Most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream in which kernels are launched.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Global offsets are signed ints; the leading template argument is false for the ascending overloads
+        typedef int OffsetT;
+        typedef DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT> DispatchT;
+
+        // Keys-only sort: a default-constructed NullType buffer fills the value slot
+        DoubleBuffer<NullType> no_values;
+
+        // The caller's key DoubleBuffer is forwarded as-is; the literal true is passed
+        // by the DoubleBuffer overloads, whose buffers may both be altered by the sort
+        return DispatchT::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            no_values,
+            num_items,
+            num_segments,
+            d_begin_offsets, d_end_offsets,
+            begin_bit, end_bit,
+            true,
+            stream, debug_synchronous);
+    }
+
+    /**
+     * \brief Sorts segments of keys into descending order. (~<em>2N</em> auxiliary storage required).
+     *
+     * \par
+     * - The contents of the input data are not altered by the sorting operation
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // No DoubleBuffer wrapper is needed for this overload: d_keys_in is read
+     * // and the sorted sequence is written to d_keys_out
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible temporary storage allocation.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to the size in bytes of the \p d_temp_storage allocation
+        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input sequence of keys to sort
+        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of keys
+        int                 num_items,                              ///< [in] Total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] Number of segments that make up the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt>.  If <tt>d_end_offsets[i]</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> Least-significant bit index (inclusive) needed for key comparison
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> Most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream in which kernels are launched.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Global offsets are signed ints; the leading template argument is true for the descending overloads
+        typedef int OffsetT;
+        typedef DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT> DispatchT;
+
+        // Pair the input key array with the output key array (const is cast away only to fit the wrapper type)
+        DoubleBuffer<KeyT>      key_buffers(const_cast<KeyT*>(d_keys_in), d_keys_out);
+        // Keys-only sort: a default-constructed NullType buffer fills the value slot
+        DoubleBuffer<NullType>  no_values;
+
+        // The literal false sets this overload (input not altered) apart from the DoubleBuffer one, which passes true
+        return DispatchT::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            key_buffers,
+            no_values,
+            num_items, num_segments,
+            d_begin_offsets, d_end_offsets,
+            begin_bit, end_bit,
+            false,
+            stream, debug_synchronous);
+    }
+
+
+    /**
+     * \brief Sorts segments of keys into descending order. (~<em>N </em>auxiliary storage required).
+     *
+     * \par
+     * - The sorting operation is given a pair of key buffers managed by a
+     *   DoubleBuffer structure that indicates which of the two buffers is
+     *   "current" (and thus contains the input data to be sorted).
+     * - The contents of both buffers may be altered by the sorting operation.
+     * - Upon completion, the sorting operation will update the "current" indicator
+     *   within the DoubleBuffer wrapper to reference which of the two buffers
+     *   now contains the sorted output sequence (a function of the number of key bits
+     *   specified and the targeted device architecture).
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+     * - \devicestorageP
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for sorting data
+     * int  num_items;          // e.g., 7
+     * int  num_segments;       // e.g., 3
+     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
+     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
+     * ...
+     *
+     * // Create a DoubleBuffer to wrap the pair of device pointers
+     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sorting operation
+     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
+     *     num_items, num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
+     *
+     * \endcode
+     *
+     * \tparam KeyT             <b>[inferred]</b> Key type
+     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            KeyT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t SortKeysDescending(
+        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
+        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in the key buffers of \p d_keys
+        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in the key buffers of \p d_keys.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive) needed for key comparison.  Default is \p 0.
+        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8).  Default is the full bit width of \p KeyT.
+        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Keys-only sort: the value side is a default-constructed DoubleBuffer of the NullType placeholder
+        DoubleBuffer<NullType> d_values;
+
+        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(   // NOTE(review): the leading 'true' presumably selects descending order -- confirm against DispatchSegmentedRadixSort
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys,
+            d_values,
+            num_items,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            begin_bit,
+            end_bit,
+            true,                   // NOTE(review): presumably the "overwrite permitted" flag (this overload documents that both key buffers may be altered) -- confirm against Dispatch()
+            stream,
+            debug_synchronous);
+    }
+
+
+    //@}  end member group
+
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/device_segmented_reduce.cuh b/external/cub/cub/device/device_segmented_reduce.cuh
new file mode 100644
index 00000000000..1964ec1f1c4
--- /dev/null
+++ b/external/cub/cub/device/device_segmented_reduce.cuh
@@ -0,0 +1,619 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../iterator/arg_index_input_iterator.cuh"
+#include "dispatch/dispatch_reduce.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png)
+ * \ingroup SegmentedModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSegmentedReduce}
+ *
+ */
+struct DeviceSegmentedReduce
+{
+    /**
+     * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor.
+     *
+     * \par
+     * - Does not support binary reduction operators that are non-commutative.
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_segments;   // e.g., 3
+     * int          *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;         // e.g., [-, -, -]
+     * CustomMin    min_op;
+     * int          initial_value;           // e.g., INT_MAX
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduction
+     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+     *
+     * // d_out <-- [6, INT_MAX, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam T                    <b>[inferred]</b> Data element type that is convertible to the \p value type of \p InputIteratorT
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT,
+        typename            ReductionOp,
+        typename            T>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Reduce(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregates (one per segment)
+        int                 num_segments,                       ///< [in] The number of segments that comprise the reduction data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in \p d_in.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        ReductionOp         reduction_op,                       ///< [in] Binary reduction functor
+        T                   initial_value,                      ///< [in] Initial value of the reduction for each segment
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOp>::Dispatch(   // all arguments are forwarded unchanged; the result of Dispatch() is returned as-is
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            reduction_op,
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
+     *
+     * \par
+     * - Uses \p 0 as the initial value of the reduction for each segment.
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p + operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sum reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int num_segments;   // e.g., 3
+     * int *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int *d_out;         // e.g., [-, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sum-reduction
+     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [21, 0, 17]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Sum(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregates (one per segment)
+        int                 num_segments,                       ///< [in] The number of segments that comprise the reduction data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in \p d_in.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The output value type (only used below to build the initial value)
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Sum>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Sum(),
+            OutputT(),            // value-initialized OutputT is the initial value for every segment (zero for arithmetic types)
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented minimum using the less-than ('<') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction for each segment.
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p < operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int num_segments;   // e.g., 3
+     * int *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int *d_out;         // e.g., [-, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run min-reduction
+     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [6, INT_MAX, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Min(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregates (one per segment)
+        int                 num_segments,                       ///< [in] The number of segments that comprise the reduction data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in \p d_in.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type (its Traits<>::Max() supplies the initial value below)
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Min>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Min(),
+            Traits<InputT>::Max(),    // initial value for every segment; replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The minimum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length segments
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p < operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_segments;   // e.g., 3
+     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmin-reduction
+     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>KeyValuePair<int, T></tt>) \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMin(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregates (one key-value tuple per segment)
+        int                 num_segments,                       ///< [in] The number of segments that comprise the reduction data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in \p d_in.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+        // The output tuple type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
+
+        // The value member type of the output tuple
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrapped input iterator to produce index-value <OffsetT, OutputValueT> tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial value for every segment: key 1 paired with the largest InputValueT
+        OutputTupleT initial_value(1, Traits<InputValueT>::Max());   // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+
+        return DispatchSegmentedReduce<ArgIndexInputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMin>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::ArgMin(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator.
+     *
+     * \par
+     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction for each segment.
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p > operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int num_segments;   // e.g., 3
+     * int *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int *d_out;         // e.g., [-, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run max-reduction
+     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [8, INT_MIN, 9]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Max(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregates (one per segment)
+        int                 num_segments,                       ///< [in] The number of segments that comprise the input data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in \p d_in.  A segment whose end offset equals its begin offset (segment 1 in the snippet above) is empty and produces the initial value.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef int OffsetT;                                                            // Signed integer type for global offsets
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;  // Value type read through d_in
+
+        // Segmented-reduction dispatcher specialized for the cub::Max functor
+        typedef DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::Max> DispatchT;
+
+        return DispatchT::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Max(),
+            Traits<InputValueT>::Lowest(),    // Initial value; stand-in for std::numeric_limits<T>::lowest() until C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
+     *   - The maximum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
+     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as <tt>segment_offsets+1</tt>).
+     * - Does not support \p > operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_segments;   // e.g., 3
+     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmax-reduction
+     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>KeyValuePair<int, T></tt>) \iterator
+     * \tparam OffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMax(
+        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                              ///< [out] Pointer to the output index-value tuples (one per segment)
+        int                 num_segments,                       ///< [in] The number of segments that comprise the input data
+        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in \p d_in.  A segment whose end offset equals its begin offset (segment 1 in the snippet above) is empty and produces the initial tuple.
+        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef int OffsetT;                                                                    // Signed integer type for global offsets
+        typedef typename std::iterator_traits<InputIteratorT>::value_type   InputValueT;        // Value type read through d_in
+        typedef typename std::iterator_traits<OutputIteratorT>::value_type  DeclaredOutputT;    // Value type advertised by d_out (may be void)
+
+        // Output tuple type: use the output iterator's own value type unless it is void,
+        // in which case fall back to KeyValuePair<OffsetT, InputValueT>
+        typedef typename If<(Equals<DeclaredOutputT, void>::VALUE),
+            KeyValuePair<OffsetT, InputValueT>,
+            DeclaredOutputT>::Type OutputTupleT;
+
+        // Type of the tuple's value member
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Input wrapper that yields index-value <OffsetT, OutputValueT> tuples, and the dispatcher reducing them with cub::ArgMax
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> IndexedInputIteratorT;
+        typedef DispatchSegmentedReduce<IndexedInputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMax> DispatchT;
+
+        IndexedInputIteratorT d_indexed_in(d_in);
+        // Initial tuple {1, lowest}; stand-in for std::numeric_limits<T>::lowest() until C++11 support is more prevalent
+        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());
+
+        return DispatchT::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::ArgMax(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/device_select.cuh b/external/cub/cub/device/device_select.cuh
new file mode 100644
index 00000000000..58bfe82ba30
--- /dev/null
+++ b/external/cub/cub/device/device_select.cuh
@@ -0,0 +1,369 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_select_if.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * These operations apply a selection criterion to selectively copy
+ * items from a specified input sequence to a compact output sequence.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSelect}
+ *
+ * \par Performance
+ * \linear_performance{select-flagged, select-if, and select-unique}
+ *
+ * \par
+ * The following chart illustrates DeviceSelect::If
+ * performance across different CUDA architectures for \p int32 items,
+ * where 50% of the items are randomly selected.
+ *
+ * \image html select_if_int32_50_percent.png
+ *
+ * \par
+ * The following chart illustrates DeviceSelect::Unique
+ * performance across different CUDA architectures for \p int32 items
+ * where segments have lengths uniformly sampled from [1,1000].
+ *
+ * \image html select_unique_int32_len_500.png
+ *
+ * \par
+ * \plots_below
+ *
+ */
+struct DeviceSelect
+{
+    /**
+     * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png)
+     *
+     * \par
+     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
+     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
+     * int  num_items;              // e.g., 8
+     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
+     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
+     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int  *d_num_selected_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
+     *
+     * // d_out                 <-- [1, 4, 6, 7]
+     * // d_num_selected_out    <-- [4]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
+     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    FlagIterator,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Flagged(
+        void                        *d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
+        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int         OffsetT;                // Signed integer type for global offsets
+        typedef NullType    NullSelectOpT;          // Placeholder for the selection functor (not used by flag-based selection)
+        typedef NullType    NullEqualityOpT;        // Placeholder for the equality functor (not used by flag-based selection)
+        typedef DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, NullSelectOpT, NullEqualityOpT, OffsetT, false> DispatchT;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_in,
+            d_flags,
+            d_out,
+            d_num_selected_out,
+            NullSelectOpT(),
+            NullEqualityOpT(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png)
+     *
+     * \par
+     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated select-if performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
+     * selected with 50% probability.
+     *
+     * \image html select_if_int32_50_percent.png
+     * \image html select_if_int64_50_percent.png
+     *
+     * \par
+     * The following charts are similar, but 5% selection probability:
+     *
+     * \image html select_if_int32_5_percent.png
+     * \image html select_if_int64_5_percent.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
+     *
+     * // Functor type for selecting values less than some criteria
+     * struct LessThan
+     * {
+     *     int compare;
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     LessThan(int compare) : compare(compare) {}
+     *
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     bool operator()(const int &a) const {
+     *         return (a < compare);
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int      num_items;              // e.g., 8
+     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
+     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int      *d_num_selected_out;    // e.g., [ ]
+     * LessThan select_op(7);
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
+     *
+     * // d_out                 <-- [0, 2, 3, 5, 2]
+     * // d_num_selected_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
+     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     * \tparam SelectOp             <b>[inferred]</b> Selection operator type having member <tt>bool operator()(const T &a)</tt>
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT,
+        typename                    SelectOp>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t If(
+        void                        *d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
+        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        SelectOp                    select_op,                      ///< [in] Unary selection operator
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int         OffsetT;                // Signed integer type for global offsets
+        typedef NullType*   NullFlagIteratorT;      // Placeholder for the flag iterator (not used by functor-based selection)
+        typedef NullType    NullEqualityOpT;        // Placeholder for the equality functor (not used by functor-based selection)
+        typedef DispatchSelectIf<InputIteratorT, NullFlagIteratorT, OutputIteratorT, NumSelectedIteratorT, SelectOp, NullEqualityOpT, OffsetT, false> DispatchT;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_in,
+            NULL,
+            d_out,
+            d_num_selected_out,
+            select_op,
+            NullEqualityOpT(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png)
+     *
+     * \par
+     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
+     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
+     * - \devicestorage
+     *
+     * \par Performance
+     * The following charts illustrate saturated select-unique performance across different
+     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
+     * lengths uniformly sampled from [1,1000].
+     *
+     * \image html select_unique_int32_len_500.png
+     * \image html select_unique_int64_len_500.png
+     *
+     * \par
+     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
+     *
+     * \image html select_unique_int32_len_5.png
+     * \image html select_unique_int64_len_5.png
+     *
+     * \par Snippet
+     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_items;              // e.g., 8
+     * int  *d_in;                  // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
+     * int  *d_num_selected_out;    // e.g., [ ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void     *d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run selection
+     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
+     *
+     * // d_out                 <-- [0, 2, 9, 5, 8]
+     * // d_num_selected_out    <-- [5]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
+     * \tparam NumSelectedIteratorT <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
+     */
+    template <
+        typename                    InputIteratorT,
+        typename                    OutputIteratorT,
+        typename                    NumSelectedIteratorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Unique(
+        void                        *d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
+        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int         OffsetT;                // Signed integer type for global offsets
+        typedef NullType*   NullFlagIteratorT;      // Placeholder for the flag iterator (not used by unique selection)
+        typedef NullType    NullSelectOpT;          // Placeholder for the selection functor (not used by unique selection)
+        typedef Equality    EqualityOpT;            // Default == comparison between keys
+        typedef DispatchSelectIf<InputIteratorT, NullFlagIteratorT, OutputIteratorT, NumSelectedIteratorT, NullSelectOpT, EqualityOpT, OffsetT, false> DispatchT;
+
+        return DispatchT::Dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_in,
+            NULL,
+            d_out,
+            d_num_selected_out,
+            NullSelectOpT(),
+            EqualityOpT(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+/**
+ * \example example_device_select_flagged.cu
+ * \example example_device_select_if.cu
+ * \example example_device_select_unique.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/device_spmv.cuh b/external/cub/cub/device/device_spmv.cuh
new file mode 100644
index 00000000000..8f3a4c5cc05
--- /dev/null
+++ b/external/cub/cub/device/device_spmv.cuh
@@ -0,0 +1,174 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "dispatch/dispatch_spmv_orig.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV).
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * The [<em>SpMV computation</em>](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication)
+ * performs the matrix-vector operation
+ * <em>y</em> = <em>alpha</em>*<b>A</b>*<em>x</em> + <em>beta</em>*<em>y</em>,
+ * where:
+ *  - <b>A</b> is an <em>m</em>x<em>n</em> sparse matrix whose non-zero structure is specified in
+ *    [<em>compressed-storage-row (CSR) format</em>](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29)
+ *    (i.e., three arrays: <em>values</em>, <em>row_offsets</em>, and <em>column_indices</em>)
+ *  - <em>x</em> and <em>y</em> are dense vectors
+ *  - <em>alpha</em> and <em>beta</em> are scalar multiplicands
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSpmv}
+ *
+ */
+struct DeviceSpmv
+{
+    /******************************************************************//**
+     * \name CSR matrix operations
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief This function performs the matrix-vector operation <em>y</em> = <b>A</b>*<em>x</em> (i.e., <em>alpha</em> is fixed to 1 and <em>beta</em> to 0).
+     *
+     * \par Snippet
+     * The code snippet below illustrates SpMV upon a 9x9 CSR matrix <b>A</b>
+     * representing a 3x3 lattice (24 non-zeros).
+     *
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_spmv.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x,
+     * // and output vector y
+     * int    num_rows = 9;
+     * int    num_cols = 9;
+     * int    num_nonzeros = 24;
+     *
+     * float* d_values;  // e.g., [1, 1, 1, 1, 1, 1, 1, 1,
+     *                   //        1, 1, 1, 1, 1, 1, 1, 1,
+     *                   //        1, 1, 1, 1, 1, 1, 1, 1]
+     *
+     * int*   d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0,
+     *                          //        4, 6, 1, 3, 5, 7, 2, 4,
+     *                          //        8, 3, 7, 4, 6, 8, 5, 7]
+     *
+     * int*   d_row_offsets;    // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24]
+     *
+     * float* d_vector_x;       // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1]
+     * float* d_vector_y;       // e.g., [ ,  ,  ,  ,  ,  ,  ,  ,  ]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void*    d_temp_storage = NULL;
+     * size_t   temp_storage_bytes = 0;
+     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
+     *     d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
+     *     num_rows, num_cols, num_nonzeros);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run SpMV
+     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
+     *     d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
+     *     num_rows, num_cols, num_nonzeros);
+     *
+     * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2]
+     *
+     * \endcode
+     *
+     * \tparam ValueT       <b>[inferred]</b> Matrix and vector value type (e.g., \p float, \p double, etc.)
+     */
+    template <
+        typename            ValueT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t CsrMV(
+        void*               d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        ValueT*             d_values,                           ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
+        int*                d_row_offsets,                      ///< [in] Pointer to the array of \p num_rows + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
+        int*                d_column_indices,                   ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-based.)
+        ValueT*             d_vector_x,                         ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
+        ValueT*             d_vector_y,                         ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
+        int                 num_rows,                           ///< [in] number of rows of matrix <b>A</b>.
+        int                 num_cols,                           ///< [in] number of columns of matrix <b>A</b>.
+        int                 num_nonzeros,                       ///< [in] number of nonzero elements of matrix <b>A</b>.
+        cudaStream_t        stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        SpmvParams<ValueT, int> spmv_params;
+        spmv_params.d_values             = d_values;
+        spmv_params.d_row_end_offsets    = d_row_offsets + 1;   // skip the leading entry: row i ends where row i+1 starts
+        spmv_params.d_column_indices     = d_column_indices;
+        spmv_params.d_vector_x           = d_vector_x;
+        spmv_params.d_vector_y           = d_vector_y;
+        spmv_params.num_rows             = num_rows;
+        spmv_params.num_cols             = num_cols;
+        spmv_params.num_nonzeros         = num_nonzeros;
+        spmv_params.alpha                = ValueT(1.0);         // explicit conversion: no implicit double -> ValueT narrowing
+        spmv_params.beta                 = ValueT(0.0);         // beta == 0, so the prior contents of y do not contribute
+
+        return DispatchSpmv<ValueT, int>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            spmv_params,
+            stream,
+            debug_synchronous);
+    }
+
+    //@}  end member group
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/dispatch/dispatch_histogram.cuh b/external/cub/cub/device/dispatch/dispatch_histogram.cuh
new file mode 100644
index 00000000000..cdebd8b8555
--- /dev/null
+++ b/external/cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -0,0 +1,1096 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of sample data residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+#include <limits>
+
+#include "../../agent/agent_histogram.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../thread/thread_search.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/******************************************************************************
+ * Histogram kernel entry points
+ *****************************************************************************/
+
+/**
+ * Histogram initialization kernel entry point
+ */
+template <
+    int                                             NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename                                        CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
+    typename                                        OffsetT>                        ///< Signed integer type for global offsets (not referenced inside this kernel)
+__global__ void DeviceHistogramInitKernel(
+    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>          num_output_bins_wrapper,        ///< Number of output histogram bins per channel
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>    d_output_histograms_wrapper,    ///< Per-channel output counter arrays; channel c holds <tt>num_output_bins_wrapper.array[c]</tt> counters
+    GridQueue<int>                                  tile_queue)                     ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // Exactly one thread of the grid (thread 0 of block 0) calls ResetDrain() on the tile queue
+    const bool is_grid_leader = (blockIdx.x == 0) && (threadIdx.x == 0);
+    if (is_grid_leader) tile_queue.ResetDrain();
+    // Every thread owns one bin index and zeroes that bin in each channel that has at least that many bins
+    int bin_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    #pragma unroll
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        if (num_output_bins_wrapper.array[channel] > bin_idx)
+            d_output_histograms_wrapper.array[channel][bin_idx] = 0;
+    }
+}
+
+
+/**
+ * Histogram privatized sweep kernel entry point (multi-block).  Computes privatized histograms, one per thread block.
+ */
+template <
+    typename                                            AgentHistogramPolicyT,          ///< Parameterized AgentHistogramPolicy tuning policy type
+    int                                                 PRIVATIZED_SMEM_BINS,           ///< Maximum number of histogram bins per channel (e.g., up to 256)
+    int                                                 NUM_CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int                                                 NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename                                            SampleIteratorT,                ///< The input iterator type. \iterator.
+    typename                                            CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
+    typename                                            PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
+    typename                                            OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
+    typename                                            OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
+__global__ void DeviceHistogramSweepKernel(
+    SampleIteratorT                                         d_samples,                          ///< Input samples to histogram
+    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_output_bins_wrapper,            ///< The number of bins per final output histogram
+    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_privatized_bins_wrapper,        ///< The number of bins per privatized histogram
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_output_histograms_wrapper,        ///< Reference to final output histograms
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_privatized_histograms_wrapper,    ///< Reference to privatized histograms
+    ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>      output_decode_op_wrapper,           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+    ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>  privatized_decode_op_wrapper,       ///< The transform operator for determining privatized counter indices from samples, one for each channel
+    OffsetT                                                 num_row_pixels,                     ///< The number of multi-channel pixels per row in the region of interest
+    OffsetT                                                 num_rows,                           ///< The number of rows in the region of interest
+    OffsetT                                                 row_stride_samples,                 ///< The number of samples between starts of consecutive rows in the region of interest
+    int                                                     tiles_per_row,                      ///< Number of image tiles per row
+    GridQueue<int>                                          tile_queue)                         ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // AgentHistogram instantiation matching this kernel's template parameterization
+    typedef AgentHistogram<
+            AgentHistogramPolicyT,
+            PRIVATIZED_SMEM_BINS,
+            NUM_CHANNELS,
+            NUM_ACTIVE_CHANNELS,
+            SampleIteratorT,
+            CounterT,
+            PrivatizedDecodeOpT,
+            OutputDecodeOpT,
+            OffsetT>
+        SweepAgentT;
+
+    // Statically-declared shared-memory scratch space handed to the agent
+    __shared__ typename SweepAgentT::TempStorage agent_smem;
+
+    SweepAgentT sweep_agent(
+        agent_smem,
+        d_samples,
+        num_output_bins_wrapper.array,
+        num_privatized_bins_wrapper.array,
+        d_output_histograms_wrapper.array,
+        d_privatized_histograms_wrapper.array,
+        output_decode_op_wrapper.array,
+        privatized_decode_op_wrapper.array);
+
+    // Step 1: initialize the bin counters
+    sweep_agent.InitBinCounters();
+
+    // Step 2: consume the input tiles of the region of interest
+    sweep_agent.ConsumeTiles(
+        num_row_pixels,
+        num_rows,
+        row_stride_samples,
+        tiles_per_row,
+        tile_queue);
+
+    // Step 3: store output to global (if necessary)
+    sweep_agent.StoreOutput();
+
+}
+
+
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
+ */
+template <
+    int         NUM_CHANNELS,               ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int         NUM_ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
+    typename    SampleIteratorT,            ///< Random-access input iterator type for reading input items \iterator
+    typename    CounterT,                   ///< Integer type for counting sample occurrences per histogram bin
+    typename    LevelT,                     ///< Type for specifying bin level boundaries
+    typename    OffsetT>                    ///< Signed integer type for global offsets
+struct DipatchHistogram
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The sample value type of the input iterator
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    enum
+    {
+        // Maximum number of bins per channel for which we will use a privatized smem strategy
+        MAX_PRIVATIZED_SMEM_BINS = 256
+    };
+
+
+    //---------------------------------------------------------------------
+    // Transform functors for converting samples to bin-ids
+    //---------------------------------------------------------------------
+
+    // Searches for bin given a list of bin-boundary levels
+    template <typename LevelIteratorT>
+    struct SearchTransform
+    {
+        LevelIteratorT  d_levels;                   // Iterator (or pointer) to the bin-boundary levels
+        int             num_output_levels;          // Number of boundary levels (one more than the number of bins)
+
+        // Records the levels array and its length for later use by BinSelect
+        __host__ __device__ __forceinline__ void Init(
+            LevelIteratorT  d_levels,               // Iterator (or pointer) to the bin-boundary levels
+            int             num_output_levels)      // Number of boundary levels
+        {
+            this->num_output_levels = num_output_levels;
+            this->d_levels          = d_levels;
+        }
+
+        // Maps a sample to a bin-id by searching the levels; bin is left untouched when valid is false
+        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
+        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
+        {
+            // Native pointers get wrapped in a CacheModifiedInputIterator; any other iterator type is used as supplied
+            typedef typename If<IsPointer<LevelIteratorT>::VALUE,
+                    CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>,
+                    LevelIteratorT>::Type
+                LevelCursorT;
+
+            LevelCursorT levels_begin(d_levels);
+
+            if (!valid)
+                return;
+
+            // A candidate index at or beyond the bin count is reported as -1
+            const int bin_count = num_output_levels - 1;
+            const int candidate = UpperBound(levels_begin, num_output_levels, (LevelT) sample) - 1;
+            bin = (candidate >= bin_count) ? -1 : candidate;
+        }
+    };
+
+
+    // Scales samples to evenly-spaced bins
+    struct ScaleTransform
+    {
+        int    num_bins;    // Number of bins (num_output_levels - 1, see Init)
+        LevelT max;         // Max sample level (exclusive)
+        LevelT min;         // Min sample level (inclusive)
+        LevelT scale;       // Bin scaling factor: stored as given by the generic Init, stored as its reciprocal by the float/double Init overloads
+
+        // Initializer (generic overload): stores the scale unchanged, to be divided by in the generic BinSelect
+        template <typename _LevelT>
+        __host__ __device__ __forceinline__ void Init(
+            int     num_output_levels,  // Number of levels in array
+            _LevelT max,                // Max sample level (exclusive)
+            _LevelT min,                // Min sample level (inclusive)
+            _LevelT scale)              // Bin scaling factor
+        {
+            this->num_bins = num_output_levels - 1;
+            this->max = max;
+            this->min = min;
+            this->scale = scale;
+        }
+
+        // Initializer (non-template float overload): stores 1/scale so the matching BinSelect can multiply instead of divide
+        __host__ __device__ __forceinline__ void Init(
+            int    num_output_levels,   // Number of levels in array
+            float   max,                // Max sample level (exclusive)
+            float   min,                // Min sample level (inclusive)
+            float   scale)              // Bin scaling factor (its reciprocal is what gets stored)
+        {
+            this->num_bins = num_output_levels - 1;
+            this->max = max;
+            this->min = min;
+            this->scale = float(1.0) / scale;
+        }
+
+        // Initializer (non-template double overload): stores 1/scale so the matching BinSelect can multiply instead of divide
+        __host__ __device__ __forceinline__ void Init(
+            int    num_output_levels,   // Number of levels in array
+            double max,                 // Max sample level (exclusive)
+            double min,                 // Min sample level (inclusive)
+            double scale)               // Bin scaling factor (its reciprocal is what gets stored)
+        {
+            this->num_bins = num_output_levels - 1;
+            this->max = max;
+            this->min = min;
+            this->scale = double(1.0) / scale;
+        }
+
+        // Converts a sample to a bin-id by dividing its offset from min by scale; bin is left untouched when the sample is invalid or outside [min, max)
+        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
+        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
+        {
+            LevelT level_sample = (LevelT) sample;
+
+            if (valid && (level_sample >= min) && (level_sample < max))
+                bin = (int) ((level_sample - min) / scale);
+        }
+
+        // Float-sample overload: multiplies by scale.  NOTE(review): this overload is picked by the sample type, while the reciprocal is only stored when Init is called with float levels -- confirm callers never pair float samples with a non-float LevelT
+        template <CacheLoadModifier LOAD_MODIFIER>
+        __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
+        {
+            LevelT level_sample = (LevelT) sample;
+
+            if (valid && (level_sample >= min) && (level_sample < max))
+                bin = (int) ((level_sample - min) * scale);
+        }
+
+        // Double-sample overload: multiplies by scale.  NOTE(review): same pairing assumption as the float overload (reciprocal is only stored by the double Init overload) -- confirm against callers
+        template <CacheLoadModifier LOAD_MODIFIER>
+        __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
+        {
+            LevelT level_sample = (LevelT) sample;
+
+            if (valid && (level_sample >= min) && (level_sample < max))
+                bin = (int) ((level_sample - min) * scale);
+        }
+    };
+
+
+    // Pass-through bin transform operator
+    struct PassThruTransform
+    {
+        // Uses the sample value itself (converted to int) as the bin-id; bin is left untouched when valid is false
+        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
+        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
+        {
+            if (!valid) return;
+            bin = int(sample);
+        }
+    };
+
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies
+    //---------------------------------------------------------------------
+
+    template <int NOMINAL_ITEMS_PER_THREAD>   // Scales a nominal per-thread item count by channel count and sample size
+    struct TScale
+    {
+        enum
+        {
+            V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),                        // sizeof(SampleT) measured in ints, rounded up
+            VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)    // nominal count divided by active channels and V_SCALE; presumably floored at 1 by CUB_MAX
+        };
+    };
+
+
+    /// SM11 tuning policy: chosen by InitConfigs for PTX versions in [110, 200) and as the compile-time fallback below SM20
+    struct Policy110
+    {
+        // HistogramSweepPolicy.  NOTE(review): argument meanings below are presumed from AgentHistogramPolicy's declaration order -- confirm in agent_histogram.cuh
+        typedef AgentHistogramPolicy<
+                512,                            // presumably threads per block (read back as BLOCK_THREADS)
+                (NUM_CHANNELS == 1) ? 8 : 2,    // presumably pixels per thread (read back as PIXELS_PER_THREAD)
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                true,
+                GMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+    /// SM20 tuning policy: chosen by InitConfigs for PTX versions in [200, 300)
+    struct Policy200
+    {
+        // HistogramSweepPolicy.  NOTE(review): argument meanings below are presumed from AgentHistogramPolicy's declaration order -- confirm in agent_histogram.cuh
+        typedef AgentHistogramPolicy<
+                (NUM_CHANNELS == 1) ? 256 : 128,    // presumably threads per block (read back as BLOCK_THREADS)
+                (NUM_CHANNELS == 1) ? 8 : 3,        // presumably pixels per thread (read back as PIXELS_PER_THREAD)
+                (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                SMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+    /// SM30 tuning policy: chosen by InitConfigs for PTX versions in [300, 350); same arguments as Policy110
+    struct Policy300
+    {
+        // HistogramSweepPolicy.  NOTE(review): argument meanings below are presumed from AgentHistogramPolicy's declaration order -- confirm in agent_histogram.cuh
+        typedef AgentHistogramPolicy<
+                512,                            // presumably threads per block (read back as BLOCK_THREADS)
+                (NUM_CHANNELS == 1) ? 8 : 2,    // presumably pixels per thread (read back as PIXELS_PER_THREAD)
+                BLOCK_LOAD_DIRECT,
+                LOAD_DEFAULT,
+                true,
+                GMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+    /// SM35 tuning policy: chosen by InitConfigs for PTX versions in [350, 500)
+    struct Policy350
+    {
+        // HistogramSweepPolicy.  NOTE(review): argument meanings below are presumed from AgentHistogramPolicy's declaration order -- confirm in agent_histogram.cuh
+        typedef AgentHistogramPolicy<
+                128,                    // presumably threads per block (read back as BLOCK_THREADS)
+                TScale<8>::VALUE,       // nominal 8 scaled by TScale; presumably pixels per thread (read back as PIXELS_PER_THREAD)
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                BLEND,
+                true>
+            HistogramSweepPolicy;
+    };
+
+    /// SM50 tuning policy: chosen by InitConfigs for PTX versions of 500 and above
+    struct Policy500
+    {
+        // HistogramSweepPolicy.  NOTE(review): argument meanings below are presumed from AgentHistogramPolicy's declaration order -- confirm in agent_histogram.cuh
+        typedef AgentHistogramPolicy<
+                384,                    // presumably threads per block (read back as BLOCK_THREADS)
+                TScale<16>::VALUE,      // nominal 16 scaled by TScale; presumably pixels per thread (read back as PIXELS_PER_THREAD)
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                SMEM,
+                false>
+            HistogramSweepPolicy;
+    };
+
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies of current PTX compiler pass
+    //---------------------------------------------------------------------
+
+#if (CUB_PTX_ARCH >= 500)
+    typedef Policy500 PtxPolicy;    // compiling for PTX 500 or newer
+
+#elif (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;    // compiling for PTX in [350, 500)
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;    // compiling for PTX in [300, 350)
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;    // compiling for PTX in [200, 300)
+
+#else
+    typedef Policy110 PtxPolicy;    // everything else, including any pass where CUB_PTX_ARCH is below 200
+
+#endif
+
+    // "Opaque" policy: a distinct struct deriving from the selected sweep policy, so the parameterization isn't reflected in the type signature
+    struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
+
+
+    //---------------------------------------------------------------------
+    // Utilities
+    //---------------------------------------------------------------------
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t InitConfigs(
+        int             ptx_version,
+        KernelConfig    &histogram_sweep_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // Device-side compilation pass: the policy is already fixed at compile time, so ptx_version is not consulted
+        return histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
+
+    #else
+
+        // Host-side pass: walk the policies from newest to oldest and take the first one the device's PTX version satisfies
+        if (ptx_version >= 500)
+        {
+            return histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
+        }
+        if (ptx_version >= 350)
+        {
+            return histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
+        }
+        if (ptx_version >= 300)
+        {
+            return histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
+        }
+        if (ptx_version >= 200)
+        {
+            return histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
+        }
+        if (ptx_version >= 110)
+        {
+            return histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
+        }
+
+        // Falling through every guard means the PTX version is below 110:
+        // no global atomic support, so histogramming cannot be dispatched
+        return cudaErrorNotSupported;
+
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration: the launch-relevant constants copied out of a tuning policy
+     */
+    struct KernelConfig
+    {
+        int                             block_threads;          // Copied from BlockPolicy::BLOCK_THREADS
+        int                             pixels_per_thread;      // Copied from BlockPolicy::PIXELS_PER_THREAD
+
+        template <typename BlockPolicy>     // BlockPolicy must expose BLOCK_THREADS and PIXELS_PER_THREAD
+        CUB_RUNTIME_FUNCTION __forceinline__
+        cudaError_t Init()
+        {
+            block_threads               = BlockPolicy::BLOCK_THREADS;
+            pixels_per_thread           = BlockPolicy::PIXELS_PER_THREAD;
+
+            return cudaSuccess;     // always succeeds; the error-code return lets InitConfigs forward it directly
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Privatization-based dispatch routine: computes the sweep grid, sizes/aliases temporary storage, then launches the histogram init and sweep kernels
+     */
+    template <
+        typename                            PrivatizedDecodeOpT,                            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
+        typename                            OutputDecodeOpT,                                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
+        typename                            DeviceHistogramInitKernelT,                     ///< Function type of cub::DeviceHistogramInitKernel
+        typename                            DeviceHistogramSweepKernelT>                    ///< Function type of cub::DeviceHistogramSweepKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t PrivatizedDispatch(
+        void*                               d_temp_storage,                                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no kernels are launched.
+        size_t&                             temp_storage_bytes,                             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT                     d_samples,                                      ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*                           d_output_histograms[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_output_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                                 num_privatized_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of privatized bin level boundaries in each active channel.  Implies that the number of privatized bins for channel<sub><em>i</em></sub> is <tt>num_privatized_levels[i]</tt> - 1.
+        PrivatizedDecodeOpT                 privatized_decode_op[NUM_ACTIVE_CHANNELS],      ///< [in] Transform operators for determining privatized counter indices from samples, one for each channel
+        int                                 num_output_levels[NUM_ACTIVE_CHANNELS],         ///< [in] The number of output bin level boundaries in each active channel.  Implies that the number of output bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        OutputDecodeOpT                     output_decode_op[NUM_ACTIVE_CHANNELS],          ///< [in] Transform operators for determining output bin-ids from privatized counter indices, one for each channel
+        int                                 max_num_output_bins,                            ///< [in] Maximum number of output bins in any channel (used to size the init kernel grid)
+        OffsetT                             num_row_pixels,                                 ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT                             num_rows,                                       ///< [in] The number of rows in the region of interest
+        OffsetT                             row_stride_samples,                             ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        DeviceHistogramInitKernelT          histogram_init_kernel,                          ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
+        DeviceHistogramSweepKernelT         histogram_sweep_kernel,                         ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
+        KernelConfig                        histogram_sweep_config,                         ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
+        cudaStream_t                        stream,                                         ///< [in] CUDA stream to launch kernels within.
+        bool                                debug_synchronous)                              ///< [in] Whether or not to log the kernel launch configurations and synchronize the stream after the sweep kernel launch to check for errors.  May cause significant slowdown.
+    {
+    #ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+    #else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get SM occupancy for histogram_sweep_kernel
+            int histogram_sweep_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                histogram_sweep_sm_occupancy,
+                histogram_sweep_kernel,
+                histogram_sweep_config.block_threads))) break;
+
+            // Get device occupancy for histogram_sweep_kernel
+            int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
+
+            if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
+            {
+                // Row stride equals row length (rows are back-to-back): treat as a single linear array of samples
+                num_row_pixels      *= num_rows;
+                num_rows            = 1;
+                row_stride_samples  = num_row_pixels * NUM_CHANNELS;
+            }
+
+            // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy.  NOTE(review): the rounded-up pixel sum is narrowed to int before the division by pixels_per_tile
+            int pixels_per_tile     = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
+            int tiles_per_row       = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
+            int blocks_per_row      = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
+            int blocks_per_col      = (blocks_per_row > 0) ?
+                                        int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
+                                        0;
+            int num_thread_blocks   = blocks_per_row * blocks_per_col;
+
+            dim3 sweep_grid_dims;
+            sweep_grid_dims.x = (unsigned int) blocks_per_row;
+            sweep_grid_dims.y = (unsigned int) blocks_per_col;
+            sweep_grid_dims.z = 1;
+
+            // Temporary storage allocation requirements: one region of per-block privatized counters for each active channel, plus one for the grid queue
+            const int   NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
+            void*       allocations[NUM_ALLOCATIONS];
+            size_t      allocation_sizes[NUM_ALLOCATIONS];
+
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
+
+            allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the grid queue descriptor
+            GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
+
+            // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
+
+            // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
+
+            // Setup array wrapper for the privatized decode ops, i.e. sample -> privatized counter index (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
+
+            // Setup array wrapper for the output decode ops, i.e. privatized counter index -> output bin-id (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
+
+            // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
+
+            // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
+            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+                num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
+
+            int histogram_init_block_threads    = 256;
+            int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
+
+            // Log DeviceHistogramInitKernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
+                histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
+
+            // Invoke histogram_init_kernel (enough 256-thread blocks to cover max_num_output_bins)
+            histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
+                num_output_bins_wrapper,
+                d_output_histograms_wrapper,
+                tile_queue);
+
+            // Return if empty problem.  NOTE(review): the init kernel launch status has not been checked at this point
+            if ((blocks_per_row == 0) || (blocks_per_col == 0))
+                break;
+
+            // Log histogram_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
+                sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
+                histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
+
+            // Invoke histogram_sweep_kernel
+            histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
+                d_samples,
+                num_output_bins_wrapper,
+                num_privatized_bins_wrapper,
+                d_output_histograms_wrapper,
+                d_privatized_histograms_wrapper,
+                output_decode_op_wrapper,
+                privatized_decode_op_wrapper,
+                num_row_pixels,
+                num_rows,
+                row_stride_samples,
+                tiles_per_row,
+                tile_queue);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+        }
+        while (0);
+
+        return error;
+
+    #endif // CUB_RUNTIME_ENABLED
+    }
+
+
+
+    /**
+     * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit (SearchTransform maps samples to privatized bins; PassThruTransform maps privatized bins to output bins)
+     */
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t DispatchRange(
+        void*               d_temp_storage,                                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_output_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within (forwarded to PrivatizedDispatch).
+        bool                debug_synchronous,                          ///< [in] Debug flag forwarded to PrivatizedDispatch: whether or not to synchronize the stream to check for errors.  May cause significant slowdown.
+        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is an 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the search transform op for converting samples to privatized bins
+            typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;
+
+            // Use the pass-thru transform op for converting privatized bins to output bins
+            typedef PassThruTransform OutputDecodeOpT;
+
+            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                     max_levels = num_output_levels[0];
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            // Dispatch (privatized levels and output levels are the same array in both branches)
+            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
+            {
+                // Too many bins to keep in shared memory: instantiate the sweep kernel with zero shared-memory privatized bins
+                const int PRIVATIZED_SMEM_BINS = 0;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+            else
+            {
+                // Dispatch shared-privatized approach (sweep kernel instantiated with MAX_PRIVATIZED_SMEM_BINS)
+                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+
+        } while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
+     */
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t DispatchRange(
+        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_output_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within (forwarded to PrivatizedDispatch).
+        bool                debug_synchronous,                          ///< [in] Debug flag forwarded to PrivatizedDispatch: whether or not to synchronize the stream to check for errors.  May cause significant slowdown.
+        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is an 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the pass-thru transform op for converting samples to privatized bins
+            typedef PassThruTransform PrivatizedDecodeOpT;
+
+            // Use the search transform op for converting privatized bins to output bins
+            typedef SearchTransform<LevelT*> OutputDecodeOpT;
+
+            int                         num_privatized_levels[NUM_ACTIVE_CHANNELS];
+            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                         max_levels = num_output_levels[0];              // Maximum number of output levels in any channel
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                num_privatized_levels[channel] = 257;
+                output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
+
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            const int PRIVATIZED_SMEM_BINS = 256;
+
+            if (CubDebug(error = PrivatizedDispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_output_histograms,
+                num_privatized_levels,
+                privatized_decode_op,
+                num_output_levels,
+                output_decode_op,
+                max_num_output_bins,
+                num_row_pixels,
+                num_rows,
+                row_stride_samples,
+                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                histogram_sweep_config,
+                stream,
+                debug_synchronous))) break;
+
+        } while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit.  NOTE(review): the per-channel scale divides by (num_output_levels[i] - 1); confirm callers guarantee at least two levels per channel.
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t DispatchEven(
+        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_output_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within (forwarded to PrivatizedDispatch).
+        bool                debug_synchronous,                          ///< [in] Debug flag forwarded to PrivatizedDispatch: whether or not to synchronize the stream to check for errors.  May cause significant slowdown.
+        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is an 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the scale transform op for converting samples to privatized bins
+            typedef ScaleTransform PrivatizedDecodeOpT;
+
+            // Use the pass-thru transform op for converting privatized bins to output bins
+            typedef PassThruTransform OutputDecodeOpT;
+
+            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                         max_levels = num_output_levels[0];
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                int     bins    = num_output_levels[channel] - 1;
+                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
+
+                privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
+
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
+            {
+                // Too many bins to keep in shared memory: instantiate the sweep kernel with zero shared-memory privatized bins
+                const int PRIVATIZED_SMEM_BINS = 0;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+            else
+            {
+                // Dispatch shared-privatized approach (sweep kernel instantiated with MAX_PRIVATIZED_SMEM_BINS)
+                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
+
+                if (CubDebug(error = PrivatizedDispatch(
+                    d_temp_storage,
+                    temp_storage_bytes,
+                    d_samples,
+                    d_output_histograms,
+                    num_output_levels,
+                    privatized_decode_op,
+                    num_output_levels,
+                    output_decode_op,
+                    max_num_output_bins,
+                    num_row_pixels,
+                    num_rows,
+                    row_stride_samples,
+                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                    histogram_sweep_config,
+                    stream,
+                    debug_synchronous))) break;
+            }
+        }
+        while (0);
+
+        return error;
+    }
+
+
+    /**
+     * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels).  NOTE(review): the per-channel scale divides by (num_output_levels[i] - 1); confirm callers guarantee at least two levels per channel.
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t DispatchEven(
+        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_output_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
+        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
+        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within (forwarded to PrivatizedDispatch).
+        bool                debug_synchronous,                          ///< [in] Debug flag forwarded to PrivatizedDispatch: whether or not to synchronize the stream to check for errors.  May cause significant slowdown.
+        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is an 8b type
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig histogram_sweep_config;
+            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
+                break;
+
+            // Use the pass-thru transform op for converting samples to privatized bins
+            typedef PassThruTransform PrivatizedDecodeOpT;
+
+            // Use the scale transform op for converting privatized bins to output bins
+            typedef ScaleTransform OutputDecodeOpT;
+
+            int                     num_privatized_levels[NUM_ACTIVE_CHANNELS];
+            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
+            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
+            int                     max_levels = num_output_levels[0];
+
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                num_privatized_levels[channel] = 257;
+
+                int     bins    = num_output_levels[channel] - 1;
+                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
+                output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
+
+                if (num_output_levels[channel] > max_levels)
+                    max_levels = num_output_levels[channel];
+            }
+            int max_num_output_bins = max_levels - 1;
+
+            const int PRIVATIZED_SMEM_BINS = 256;
+
+            if (CubDebug(error = PrivatizedDispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_samples,
+                d_output_histograms,
+                num_privatized_levels,
+                privatized_decode_op,
+                num_output_levels,
+                output_decode_op,
+                max_num_output_bins,
+                num_row_pixels,
+                num_rows,
+                row_stride_samples,
+                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
+                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
+                histogram_sweep_config,
+                stream,
+                debug_synchronous))) break;
+
+        }
+        while (0);
+
+        return error;
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/external/cub/cub/device/dispatch/dispatch_radix_sort.cuh
new file mode 100644
index 00000000000..f9793ebd53e
--- /dev/null
+++ b/external/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -0,0 +1,1652 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/agent_radix_sort_upsweep.cuh"
+#include "../../agent/agent_radix_sort_downsweep.cuh"
+#include "../../agent/agent_scan.cuh"
+#include "../../block/block_radix_sort.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../util_type.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Upsweep digit-counting kernel entry point (multi-block).  Each thread block counts the digits of the keys in its even-share region and writes its privatized digit histogram to \p d_spine.
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))
+__global__ void DeviceRadixSortUpsweepKernel(
+    const KeyT              *d_keys,                        ///< [in] Input keys buffer
+    OffsetT                 *d_spine,                       ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    OffsetT                 /*num_items*/,                  ///< [in] Total number of input data items (unnamed: not read by this kernel)
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     num_bits,                       ///< [in] Number of bits of current radix digit
+    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    enum {  // NOTE(review): TILE_ITEMS is taken from AltUpsweepPolicy for both values of ALT_DIGIT_BITS -- confirm against GridEvenShare::BlockInit that this is intended
+        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
+                        ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
+    };
+
+    // Parameterize AgentRadixSortUpsweep type for the current configuration
+    typedef AgentRadixSortUpsweep<
+            typename If<(ALT_DIGIT_BITS),
+                typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
+                typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type,
+            KeyT,
+            OffsetT>
+        AgentRadixSortUpsweepT;
+
+    // Shared memory storage
+    __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage;
+
+    // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block
+    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
+    // Digit-counting agent bound to the shared storage and the current digit place
+    AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits);
+    // Count digits over this block's region (even_share.block_offset up to even_share.block_end)
+    upsweep.ProcessRegion(even_share.block_offset, even_share.block_end);
+
+    CTA_SYNC();
+
+    // Write out digit counts (striped).  ExtractCounts is a member template of a dependent type, so the call needs the 'template' disambiguator.
+    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
+}
+
+
+/**
+ * Spine scan kernel entry point (single-block).  Runs an in-place exclusive prefix sum over the privatized digit histograms held in \p d_spine, one full tile at a time.
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortScanBinsKernel(
+    OffsetT                 *d_spine,                       ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    int                     num_counts)                     ///< [in] Total number of bin-counts
+{
+    // Scan tuning policy chosen by the active policy chain
+    typedef typename ChainedPolicyT::ActivePolicy::ScanPolicy ScanPolicyT;
+
+    // Scan agent specialized for summing OffsetT counts through raw pointers
+    typedef AgentScan<ScanPolicyT, OffsetT*, OffsetT*, cub::Sum, OffsetT, OffsetT> SpineScanT;
+
+    // Shared memory backing the scan agent
+    __shared__ typename SpineScanT::TempStorage scan_storage;
+
+    // Agent that reads from and writes back to the spine
+    SpineScanT spine_scan(scan_storage, d_spine, d_spine, cub::Sum(), OffsetT(0));
+
+    // Running-prefix functor handed to every tile, starting from 0
+    BlockScanRunningPrefixOp<OffsetT, Sum> carry(0, Sum());
+
+    // Consume full tiles only: the loop stops as soon as fewer than
+    // TILE_ITEMS counts remain, so a trailing partial tile is not visited here
+    for (int tile_base = 0;
+         tile_base + SpineScanT::TILE_ITEMS <= num_counts;
+         tile_base += SpineScanT::TILE_ITEMS)
+    {
+        // Scan one tile of counts, threading the running prefix through
+        spine_scan.template ConsumeTile<false, false>(tile_base, carry);
+    }
+}
+
+
+/**
+ * Downsweep pass kernel entry point (multi-block).  Each thread block scatters the keys (and values) of its even-share region into the bins for the current digit place.
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                ValueT,                         ///< Value type
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))
+__global__ void DeviceRadixSortDownsweepKernel(
+    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
+    KeyT                    *d_keys_out,                    ///< [out] Output keys buffer
+    const ValueT            *d_values_in,                   ///< [in] Input values buffer
+    ValueT                  *d_values_out,                  ///< [out] Output values buffer
+    OffsetT                 *d_spine,                       ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    OffsetT                 num_items,                      ///< [in] Total number of input data items
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     num_bits,                       ///< [in] Number of bits of current radix digit
+    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    // Policy that sizes the even-share tiles (NOTE: AltUpsweepPolicy is used for both values of ALT_DIGIT_BITS)
+    typedef typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy TileSizingPolicyT;
+    enum { TILE_ITEMS = TileSizingPolicyT::BLOCK_THREADS * TileSizingPolicyT::ITEMS_PER_THREAD };
+
+    // Downsweep tuning policy: alternate (lower-bits) or primary
+    typedef typename If<(ALT_DIGIT_BITS),
+        typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
+        typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type ActiveDownsweepPolicyT;
+
+    // Downsweep agent for this key/value/offset configuration
+    typedef AgentRadixSortDownsweep<ActiveDownsweepPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT> DownsweepAgentT;
+
+    // Shared memory backing the agent
+    __shared__ typename DownsweepAgentT::TempStorage agent_storage;
+
+    // Set up this thread block's share of the input
+    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
+
+    // Bind the agent to the spine, the in/out buffers and the current digit place
+    DownsweepAgentT agent(
+        agent_storage, num_items, d_spine,
+        d_keys_in, d_keys_out, d_values_in, d_values_out,
+        current_bit, num_bits);
+
+    // Process the region assigned to this thread block
+    agent.ProcessRegion(even_share.block_offset, even_share.block_end);
+}
+
+
+/**
+ * Single pass kernel entry point (single-block).  Loads one tile of keys (and values), sorts it across the whole block, and stores the result.
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                ValueT,                         ///< Value type
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
+__global__ void DeviceRadixSortSingleTileKernel(
+    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
+    KeyT                    *d_keys_out,                    ///< [out] Output keys buffer
+    const ValueT            *d_values_in,                   ///< [in] Input values buffer
+    ValueT                  *d_values_out,                  ///< [out] Output values buffer
+    OffsetT                 num_items,                      ///< [in] Total number of input data items
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     end_bit)                        ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+{
+    // Tuning policy governing the single-tile sort
+    typedef typename ChainedPolicyT::ActivePolicy::SingleTilePolicy SingleTilePolicyT;
+
+    // Compile-time configuration
+    enum
+    {
+        BLOCK_THREADS           = SingleTilePolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = SingleTilePolicyT::ITEMS_PER_THREAD,
+        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // Block-wide radix sorter
+    typedef BlockRadixSort<
+            KeyT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            ValueT,
+            SingleTilePolicyT::RADIX_BITS,
+            (SingleTilePolicyT::RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
+            SingleTilePolicyT::SCAN_ALGORITHM>
+        TileSorterT;
+
+    // Block-wide loaders, one for keys and one for values
+    typedef BlockLoad<KeyT, BLOCK_THREADS, ITEMS_PER_THREAD,
+        SingleTilePolicyT::LOAD_ALGORITHM> KeyLoaderT;
+    typedef BlockLoad<ValueT, BLOCK_THREADS, ITEMS_PER_THREAD,
+        SingleTilePolicyT::LOAD_ALGORITHM> ValueLoaderT;
+
+    // Unsigned word holding the bits of a key
+    typedef typename Traits<KeyT>::UnsignedBits KeyBitsT;
+
+    // Shared memory: a union, so the two loaders and the sorter
+    // reuse the same bytes (hence the barriers between the phases below)
+    __shared__ union SharedStorage
+    {
+        typename TileSorterT::TempStorage           sort;
+        typename KeyLoaderT::TempStorage            load_keys;
+        typename ValueLoaderT::TempStorage          load_values;
+    } smem;
+
+    // Per-thread fragments of the tile
+    KeyT            thread_keys[ITEMS_PER_THREAD];
+    ValueT          thread_values[ITEMS_PER_THREAD];
+
+    // Key used to fill out-of-bounds slots:
+    // LOWEST_KEY for descending sorts, MAX_KEY otherwise
+    KeyBitsT        pad_bits = (IS_DESCENDING) ? Traits<KeyT>::LOWEST_KEY : Traits<KeyT>::MAX_KEY;
+    KeyT            pad_key  = reinterpret_cast<KeyT&>(pad_bits);
+
+    // Load the keys, substituting pad_key past num_items
+    KeyLoaderT(smem.load_keys).Load(d_keys_in, thread_keys, num_items, pad_key);
+
+    // Barrier before the shared storage is used by the next phase
+    CTA_SYNC();
+
+    // Key-value sorts also load the values
+    if (!KEYS_ONLY)
+    {
+        ValueLoaderT(smem.load_values).Load(d_values_in, thread_values, num_items);
+
+        CTA_SYNC();
+    }
+
+    // Sort the tile over bits [current_bit, end_bit)
+    TileSorterT(smem.sort).SortBlockedToStriped(
+        thread_keys,
+        thread_values,
+        current_bit,
+        end_bit,
+        Int2Type<IS_DESCENDING>(),
+        Int2Type<KEYS_ONLY>());
+
+    // Write the sorted tile back out; slot s of thread t
+    // goes to output position s * BLOCK_THREADS + t
+    #pragma unroll
+    for (int slot = 0; slot < ITEMS_PER_THREAD; ++slot)
+    {
+        int dst = slot * BLOCK_THREADS + threadIdx.x;
+
+        // Positions at or beyond num_items hold padding and are not stored
+        if (!(dst < num_items))
+            continue;
+
+        d_keys_out[dst] = thread_keys[slot];
+        if (!KEYS_ONLY)
+            d_values_out[dst] = thread_values[slot];
+    }
+}
+
+
+/**
+ * Segmented radix sorting pass (one block per segment).  Each block counts the digits of its own segment, scans the counts into scatter offsets, and runs the downsweep over the segment for the current digit place.
+ */
+template <
+    typename                ChainedPolicyT,                 ///< Chained tuning policy
+    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
+    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
+    typename                KeyT,                           ///< Key type
+    typename                ValueT,                         ///< Value type
+    typename                OffsetIteratorT,                ///< Random-access input iterator type for reading segment offsets \iterator
+    typename                OffsetT>                        ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
+__global__ void DeviceSegmentedRadixSortKernel(
+    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
+    KeyT                    *d_keys_out,                    ///< [out] Output keys buffer
+    const ValueT            *d_values_in,                   ///< [in] Input values buffer
+    ValueT                  *d_values_out,                  ///< [out] Output values buffer
+    OffsetIteratorT         d_begin_offsets,                ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+    OffsetIteratorT         d_end_offsets,                  ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+    int                     /*num_segments*/,               ///< [in] The number of segments that comprise the sorting data (unnamed: not read by this kernel)
+    int                     current_bit,                    ///< [in] Bit position of current radix digit
+    int                     pass_bits)                      ///< [in] Number of bits of current radix digit
+{
+    //
+    // Constants
+    //
+
+    typedef typename If<(ALT_DIGIT_BITS),
+        typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy,
+        typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT;
+
+    enum
+    {
+        BLOCK_THREADS       = SegmentedPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = SegmentedPolicyT::ITEMS_PER_THREAD,
+        RADIX_BITS          = SegmentedPolicyT::RADIX_BITS,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+        RADIX_DIGITS        = 1 << RADIX_BITS,
+        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // Upsweep type
+    typedef AgentRadixSortUpsweep<
+            AgentRadixSortUpsweepPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, SegmentedPolicyT::LOAD_MODIFIER, RADIX_BITS>,
+            KeyT,
+            OffsetT>
+        BlockUpsweepT;
+
+    // Digit-scan type
+    typedef BlockScan<OffsetT, BLOCK_THREADS> DigitScanT;
+
+    // Downsweep type
+    typedef AgentRadixSortDownsweep<SegmentedPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT> BlockDownsweepT;
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD
+    };
+
+    //
+    // Process input tiles
+    //
+
+    // Shared memory storage (a union: the upsweep, scan/reversal and downsweep phases reuse the same bytes)
+    __shared__ union
+    {
+        typename BlockUpsweepT::TempStorage     upsweep;
+        typename BlockDownsweepT::TempStorage   downsweep;
+        struct
+        {
+            volatile OffsetT                        reverse_counts_in[RADIX_DIGITS];
+            volatile OffsetT                        reverse_counts_out[RADIX_DIGITS];
+            typename DigitScanT::TempStorage        scan;
+        };
+
+    } temp_storage;
+
+    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
+    OffsetT segment_end     = d_end_offsets[blockIdx.x];
+    OffsetT num_items       = segment_end - segment_begin;
+
+    // Check if empty segment (both offsets are indexed by blockIdx.x, so every thread of the block takes the same branch)
+    if (num_items <= 0)
+        return;
+
+    // Upsweep
+    BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits);
+    upsweep.ProcessRegion(segment_begin, segment_end);
+
+    CTA_SYNC();
+
+    // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads)
+    OffsetT bin_count[BINS_TRACKED_PER_THREAD];
+    upsweep.ExtractCounts(bin_count);
+
+    CTA_SYNC();
+
+    if (IS_DESCENDING)
+    {
+        // Reverse bin counts
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                temp_storage.reverse_counts_in[bin_idx] = bin_count[track];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1];
+        }
+    }
+
+    // Scan: exclusive sum of the (possibly reversed) digit counts
+    OffsetT bin_offset[BINS_TRACKED_PER_THREAD];     // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads)
+    DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset);
+
+    #pragma unroll
+    for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+    {
+        bin_offset[track] += segment_begin;
+    }
+
+    if (IS_DESCENDING)
+    {
+        // Reverse bin offsets
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            // Store by bin index (not thread index) so each tracked bin gets its own slot, matching the read below
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                temp_storage.reverse_counts_out[bin_idx] = bin_offset[track];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+                bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1];
+        }
+    }
+
+    CTA_SYNC();
+
+    // Downsweep
+    BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits);
+    downsweep.ProcessRegion(segment_begin, segment_end);
+}
+
+
+
+/******************************************************************************
+ * Policy
+ ******************************************************************************/
+
+/**
+ * Tuning policy for kernel specialization
+ */
+template <
+    typename KeyT,          ///< Key type
+    typename ValueT,        ///< Value type
+    typename OffsetT>       ///< Signed integer type for global offsets
+struct DeviceRadixSortPolicy
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    enum
+    {
+        // Whether this is a keys-only (or key-value) sort
+        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
+
+        // Relative size of KeyT type to a 4-byte word
+        SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
+    };
+
+    //------------------------------------------------------------------------------
+    // Architecture-specific tuning policies
+    //------------------------------------------------------------------------------
+
+    /// SM13
+    struct Policy130 : ChainedPolicy<130, Policy130, Policy130>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 5,
+            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
+        };
+
+        // Keys-only upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyKeys;
+        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyKeys;
+
+        // Key-value pairs upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyPairs;
+        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyPairs;
+
+        // Upsweep policies
+        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
+
+        // Scan policy
+        typedef AgentScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
+
+        // Keys-only downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyKeys;
+
+        // Key-value pairs downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyPairs;
+        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyPairs;
+
+        // Downsweep policies
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
+
+        // Single-tile policy
+        typedef DownsweepPolicy SingleTilePolicy;
+
+        // Segmented policies
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+    };
+
+    /// SM20
+    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 5,
+            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
+        };
+
+        // Keys-only upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;
+        typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;
+
+        // Key-value pairs upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyPairs;
+        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyPairs;
+
+        // Upsweep policies
+        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
+
+        // Scan policy
+        typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Keys-only downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyKeys;
+
+        // Key-value pairs downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyPairs;
+        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyPairs;
+
+        // Downsweep policies
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
+
+        // Single-tile policy
+        typedef DownsweepPolicy SingleTilePolicy;
+
+        // Segmented policies
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+    };
+
+    /// SM30
+    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 5,
+            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
+        };
+
+        // Keys-only upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;
+        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;
+
+        // Key-value pairs upsweep policies
+        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyPairs;
+        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyPairs;
+
+        // Upsweep policies
+        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
+
+        // Scan policy
+        typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
+
+        // Keys-only downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyKeys;
+
+        // Key-value pairs downsweep policies
+        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyPairs;
+        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyPairs;
+
+        // Downsweep policies
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
+
+        // Single-tile policy
+        typedef DownsweepPolicy SingleTilePolicy;
+
+        // Segmented policies
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+    };
+
+
+    /// SM35 tuning.  Chained to Policy300 (presumably the fallback for lower PTX versions -- confirm against ChainedPolicy)
+    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 6,    // Radix bits of the primary policies.  1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m)
+        };
+
+        // Scan policy (DispatchRadixSort::InvokePasses uses it to configure the scan kernel)
+        typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
+
+        // Keys-only downsweep policies (the alternate policy is parameterized with PRIMARY_RADIX_BITS - 1)
+        typedef AgentRadixSortDownsweepPolicy <128,   CUB_MAX(1, 9 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;
+        typedef AgentRadixSortDownsweepPolicy <64,   CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys;
+
+        // Key-value pairs downsweep policies (the primary policy is an alias of the keys-only tuning)
+        typedef DownsweepPolicyKeys DownsweepPolicyPairs;
+        typedef AgentRadixSortDownsweepPolicy <128,  CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs;
+
+        // Downsweep policies: the keys-only or the pairs tuning, selected by KEYS_ONLY
+        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
+        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
+
+        // Upsweep policies (aliases of the downsweep policies)
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy (alias of the primary downsweep policy)
+        typedef DownsweepPolicy SingleTilePolicy;
+
+        // Segmented policies (aliases of the downsweep policies)
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+
+
+    };
+
+
+    /// SM50 tuning.  Chained to Policy350 (presumably the fallback for lower PTX versions -- confirm against ChainedPolicy)
+    struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 7,    // Radix bits of DownsweepPolicy.  3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX)
+            SINGLE_TILE_RADIX_BITS  = 6,    // Radix bits of SingleTilePolicy
+            SEGMENTED_RADIX_BITS    = 6,    // Radix bits of SegmentedPolicy.  3.1B 32b segmented keys/s (TitanX)
+        };
+
+        // ScanPolicy (DispatchRadixSort::InvokePasses uses it to configure the scan kernel)
+        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies (the alternate policy is parameterized with PRIMARY_RADIX_BITS - 1)
+        typedef AgentRadixSortDownsweepPolicy <160, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>  DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies (aliases of the downsweep policies)
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy (its own tuning, parameterized with SINGLE_TILE_RADIX_BITS)
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;
+
+        // Segmented policies (their own tunings; the alternate policy is parameterized with SEGMENTED_RADIX_BITS - 1)
+        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 31 / SCALE_FACTOR_4B),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>   SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1>       AltSegmentedPolicy;
+    };
+
+
+    /// SM60 (GP100) tuning.  Chained to Policy500 (presumably the fallback for lower PTX versions -- confirm against ChainedPolicy)
+    struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 7,    // Radix bits of DownsweepPolicy.  6.9B 32b keys/s (Quadro P100)
+            SINGLE_TILE_RADIX_BITS  = 6,    // Radix bits of SingleTilePolicy
+            SEGMENTED_RADIX_BITS    = 6,    // Radix bits of SegmentedPolicy.  5.9B 32b segmented keys/s (Quadro P100)
+        };
+
+        // ScanPolicy (DispatchRadixSort::InvokePasses uses it to configure the scan kernel)
+        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies (the alternate policy is parameterized with PRIMARY_RADIX_BITS - 1)
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 25 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies (aliases of the downsweep policies)
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy (its own tuning, parameterized with SINGLE_TILE_RADIX_BITS)
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+
+        // Segmented policies (their own tunings; the alternate policy is parameterized with SEGMENTED_RADIX_BITS - 1)
+        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+
+    };
+
+
+    /// SM61 (GP104) tuning.  Chained to Policy600 (presumably the fallback for lower PTX versions -- confirm against ChainedPolicy)
+    struct Policy610 : ChainedPolicy<610, Policy610, Policy600>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 7,    // Radix bits of DownsweepPolicy and UpsweepPolicy.  3.4B 32b keys/s, 1.83B 32b pairs/s (1080)
+            SINGLE_TILE_RADIX_BITS  = 6,    // Radix bits of SingleTilePolicy
+            SEGMENTED_RADIX_BITS    = 6,    // Radix bits of SegmentedPolicy.  3.3B 32b segmented keys/s (1080)
+        };
+
+        // ScanPolicy (DispatchRadixSort::InvokePasses uses it to configure the scan kernel)
+        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies (the alternate policy is parameterized with PRIMARY_RADIX_BITS - 1)
+        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 31 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT,       LOAD_DEFAULT,       RADIX_RANK_MATCH,   BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 35 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE,    LOAD_DEFAULT,   RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies (dedicated AgentRadixSortUpsweepPolicy tunings rather than aliases of the downsweep policies)
+        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS>        UpsweepPolicy;
+        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS - 1>    AltUpsweepPolicy;
+
+        // Single-tile policy (its own tuning, parameterized with SINGLE_TILE_RADIX_BITS)
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+
+        // Segmented policies (their own tunings; the alternate policy is parameterized with SEGMENTED_RADIX_BITS - 1)
+        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+    };
+
+
+    /// SM62 (Tegra, less RF) tuning.  Chained to Policy610 (presumably the fallback for lower PTX versions -- confirm against ChainedPolicy)
+    struct Policy620 : ChainedPolicy<620, Policy620, Policy610>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 5,                        // Radix bits of DownsweepPolicy and SingleTilePolicy
+            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,   // Radix bits of AltDownsweepPolicy
+        };
+
+        // ScanPolicy (DispatchRadixSort::InvokePasses uses it to configure the scan kernel)
+        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies (identical tunings apart from the radix bits)
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS>       AltDownsweepPolicy;
+
+        // Upsweep policies (aliases of the downsweep policies)
+        typedef DownsweepPolicy UpsweepPolicy;
+        typedef AltDownsweepPolicy AltUpsweepPolicy;
+
+        // Single-tile policy (its own tuning, parameterized with PRIMARY_RADIX_BITS)
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy;
+
+        // Segmented policies (aliases of the downsweep policies)
+        typedef DownsweepPolicy     SegmentedPolicy;
+        typedef AltDownsweepPolicy  AltSegmentedPolicy;
+    };
+
+
+    /// SM70 (GV100) tuning.  Chained to Policy620 (presumably the fallback for lower PTX versions -- confirm against ChainedPolicy)
+    struct Policy700 : ChainedPolicy<700, Policy700, Policy620>
+    {
+        enum {
+            PRIMARY_RADIX_BITS      = 6,    // Radix bits of DownsweepPolicy and UpsweepPolicy.  7.62B 32b keys/s (GV100)
+            SINGLE_TILE_RADIX_BITS  = 6,    // Radix bits of SingleTilePolicy
+            SEGMENTED_RADIX_BITS    = 6,    // Radix bits of SegmentedPolicy.  8.7B 32b segmented keys/s (GV100)
+        };
+
+        // ScanPolicy (DispatchRadixSort::InvokePasses uses it to configure the scan kernel)
+        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
+
+        // Downsweep policies (the alternate policy is parameterized with PRIMARY_RADIX_BITS - 1)
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 47 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 29 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
+
+        // Upsweep policies.  NOTE(review): declared with AgentRadixSortDownsweepPolicy, unlike Policy610 which uses AgentRadixSortUpsweepPolicy -- confirm this is intended
+        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 47 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>  UpsweepPolicy;
+        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 29 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>  AltUpsweepPolicy;
+
+        // Single-tile policy (its own tuning, parameterized with SINGLE_TILE_RADIX_BITS)
+        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
+
+        // Segmented policies (their own tunings; the alternate policy is parameterized with SEGMENTED_RADIX_BITS - 1)
+        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
+        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
+    };
+
+
+    /// MaxPolicy: head of the policy chain (Policy700); DispatchRadixSort names it when instantiating kernels and calls MaxPolicy::Invoke(ptx_version, ...)
+    typedef Policy700 MaxPolicy;
+
+
+};
+
+
+
+/******************************************************************************
+ * Single-problem dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort
+ */
+template <
+    bool     IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,          ///< Key type
+    typename ValueT,        ///< Value type
+    typename OffsetT>       ///< Signed integer type for global offsets
+struct DispatchRadixSort :
+    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    enum
+    {
+        // True when ValueT is NullType, i.e. this is a keys-only (rather than key-value) sort
+        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
+    };
+
+
+    //------------------------------------------------------------------------------
+    // Problem state
+    //------------------------------------------------------------------------------
+
+    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation (reference member: refers to the caller's variable)
+    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys (reference member)
+    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values (reference member)
+    OffsetT                 num_items;              ///< [in] Number of items to sort
+    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
+    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                     ptx_version;            ///< [in] PTX version (InvokePasses forwards it to PassConfig::InitPassConfig)
+    bool                    is_overwrite_okay;      ///< [in] Whether it is okay to overwrite the source buffers
+
+
+    //------------------------------------------------------------------------------
+    // Constructor
+    //------------------------------------------------------------------------------
+
+    /// Constructor: records the problem description; temp_storage_bytes, d_keys and d_values are bound by reference, everything else is copied
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchRadixSort(
+        void*                   d_temp_storage,         ///< [in] Temporary storage allocation, or NULL to request only the required size
+        size_t                  &temp_storage_bytes,    ///< [in,out] Size in bytes of \p d_temp_storage
+        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Keys double-buffer
+        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Values double-buffer
+        OffsetT                 num_items,              ///< [in] Number of items to sort
+        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+        bool                    is_overwrite_okay,      ///< [in] Whether it is okay to overwrite the source buffers
+        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within
+        bool                    debug_synchronous,      ///< [in] Whether to synchronize and log after every kernel launch
+        int                     ptx_version)            ///< [in] PTX version
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_keys(d_keys),
+        d_values(d_values),
+        num_items(num_items),
+        begin_bit(begin_bit),
+        end_bit(end_bit),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version),
+        is_overwrite_okay(is_overwrite_okay)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Small-problem (single tile) invocation
+    //------------------------------------------------------------------------------
+
+    /// Sort the whole problem in-core by launching \p single_tile_kernel with a single thread block
+    template <
+        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename                SingleTileKernelT>      ///< Function type of cub::DeviceRadixSortSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokeSingleTile(
+        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        // Kernel launch not supported from this device
+        (void)single_tile_kernel;
+        return CubDebug(cudaErrorNotSupported );
+#else
+        typedef typename ActivePolicyT::SingleTilePolicy TilePolicyT;
+
+        // A size query only reports a nominal one byte of temporary storage
+        if (d_temp_storage == NULL)
+        {
+            temp_storage_bytes = 1;
+            return cudaSuccess;
+        }
+
+        // An empty problem launches nothing and leaves the selectors untouched
+        if (num_items == 0)
+            return cudaSuccess;
+
+        // Log single_tile_kernel configuration
+        if (debug_synchronous)
+            _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+                1, TilePolicyT::BLOCK_THREADS, (long long) stream,
+                TilePolicyT::ITEMS_PER_THREAD, 1, begin_bit, TilePolicyT::RADIX_BITS);
+
+        // Invoke single_tile_kernel with one block, passing the current buffers first and the alternate buffers second
+        single_tile_kernel<<<1, TilePolicyT::BLOCK_THREADS, 0, stream>>>(
+            d_keys.Current(),
+            d_keys.Alternate(),
+            d_values.Current(),
+            d_values.Alternate(),
+            num_items,
+            begin_bit,
+            end_bit);
+
+        // Check for failure to launch
+        cudaError error = cudaPeekAtLastError();
+        if (CubDebug(error)) return error;
+
+        // Sync the stream if specified to flush runtime errors
+        if (debug_synchronous)
+        {
+            error = SyncStream(stream);
+            if (CubDebug(error)) return error;
+        }
+
+        // Update selectors so that Current() refers to what were the alternate buffers
+        d_keys.selector   ^= 1;
+        d_values.selector ^= 1;
+        return error;
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Normal problem size invocation
+    //------------------------------------------------------------------------------
+
+    /**
+     * Invoke a three-kernel sorting pass (upsweep, scan, downsweep) at \p current_bit.  On success \p current_bit is advanced by the number of bits sorted; on failure it is left unchanged and the error is returned.
+     */
+    template <typename PassConfigT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePass(
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        OffsetT         *d_spine,
+        int             spine_length,
+        int             &current_bit,
+        PassConfigT     &pass_config)
+    {
+        // Never sort past end_bit: the pass covers at most the configured radix bits
+        const int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+
+        // Log upsweep_kernel configuration
+        if (debug_synchronous)
+            _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+            pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
+            pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
+
+        // Invoke upsweep_kernel with same grid size as downsweep_kernel
+        pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
+            d_keys_in,
+            d_spine,
+            num_items,
+            current_bit,
+            pass_bits,
+            pass_config.even_share);
+
+        // Check for failure to launch
+        cudaError error = cudaPeekAtLastError();
+        if (CubDebug(error)) return error;
+
+        // Sync the stream if specified to flush runtime errors
+        if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) return error;
+
+        // Log scan_kernel configuration
+        if (debug_synchronous)
+            _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+            1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
+
+        // Invoke scan_kernel (a single block over the spine)
+        pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
+            d_spine,
+            spine_length);
+
+        // Check for failure to launch
+        error = cudaPeekAtLastError();
+        if (CubDebug(error)) return error;
+
+        // Sync the stream if specified to flush runtime errors
+        if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) return error;
+
+        // Log downsweep_kernel configuration
+        if (debug_synchronous)
+            _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+            pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
+            pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);
+
+        // Invoke downsweep_kernel
+        pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
+            d_keys_in,
+            d_keys_out,
+            d_values_in,
+            d_values_out,
+            d_spine,
+            num_items,
+            current_bit,
+            pass_bits,
+            pass_config.even_share);
+
+        // Check for failure to launch
+        error = cudaPeekAtLastError();
+        if (CubDebug(error)) return error;
+
+        // Sync the stream if specified to flush runtime errors
+        if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) return error;
+
+        // Update current bit (only reached when all three launches succeeded)
+        current_bit += pass_bits;
+        return error;
+    }
+
+
+
+    /// Pass configuration structure: the three kernels of a digit pass, their launch configurations and the work distribution for one radix-bit width
+    template <
+        typename UpsweepKernelT,
+        typename ScanKernelT,
+        typename DownsweepKernelT>
+    struct PassConfig
+    {
+        UpsweepKernelT          upsweep_kernel;             ///< Upsweep kernel function pointer
+        KernelConfig            upsweep_config;             ///< Launch configuration initialized from UpsweepPolicyT
+        ScanKernelT             scan_kernel;                ///< Scan kernel function pointer
+        KernelConfig            scan_config;                ///< Launch configuration initialized from ScanPolicyT
+        DownsweepKernelT        downsweep_kernel;           ///< Downsweep kernel function pointer
+        KernelConfig            downsweep_config;           ///< Launch configuration initialized from DownsweepPolicyT
+        int                     radix_bits;                 ///< DownsweepPolicyT::RADIX_BITS
+        int                     radix_digits;               ///< 1 << radix_bits
+        int                     max_downsweep_grid_size;    ///< Downsweep SM occupancy * SM count * CUB_SUBSCRIPTION_FACTOR(ptx_version)
+        GridEvenShare<OffsetT>  even_share;                 ///< Work distribution descriptor handed to the upsweep and downsweep kernels
+
+        /// Initialize pass configuration.  Returns the first error reported by a KernelConfig::Init call, otherwise cudaSuccess
+        template <
+            typename UpsweepPolicyT,
+            typename ScanPolicyT,
+            typename DownsweepPolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        cudaError_t InitPassConfig(
+            UpsweepKernelT      upsweep_kernel,     ///< [in] Upsweep kernel function pointer
+            ScanKernelT         scan_kernel,        ///< [in] Scan kernel function pointer
+            DownsweepKernelT    downsweep_kernel,   ///< [in] Downsweep kernel function pointer
+            int                 ptx_version,        ///< [in] PTX version
+            int                 sm_count,           ///< [in] Number of multiprocessors on the device
+            OffsetT             num_items)          ///< [in] Number of items to sort (OffsetT, so a wider offset type is not narrowed to int)
+        {
+            cudaError error = cudaSuccess;
+            do
+            {
+                this->upsweep_kernel    = upsweep_kernel;
+                this->scan_kernel       = scan_kernel;
+                this->downsweep_kernel  = downsweep_kernel;
+                radix_bits              = DownsweepPolicyT::RADIX_BITS;
+                radix_digits            = 1 << radix_bits;
+
+                if (CubDebug(error = upsweep_config.Init<UpsweepPolicyT>(upsweep_kernel))) break;
+                if (CubDebug(error = scan_config.Init<ScanPolicyT>(scan_kernel))) break;
+                if (CubDebug(error = downsweep_config.Init<DownsweepPolicyT>(downsweep_kernel))) break;
+
+                max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version);
+
+                // Both kernels are launched with even_share.grid_size, so distribute by the larger of the two tile sizes
+                even_share.DispatchInit(
+                    num_items,
+                    max_downsweep_grid_size,
+                    CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
+
+            }
+            while (0);
+            return error;
+        }
+
+    };
+
+
+    /// Invocation (run multiple digit passes)
+    template <
+        typename            ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename            UpsweepKernelT,         ///< Function type of cub::DeviceRadixSortUpsweepKernel
+        typename            ScanKernelT,            ///< Function type of cub::SpineScanKernel
+        typename            DownsweepKernelT>       ///< Function type of cub::DeviceRadixSortDownsweepKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        UpsweepKernelT      upsweep_kernel,         ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+        UpsweepKernelT      alt_upsweep_kernel,     ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+        ScanKernelT         scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
+        DownsweepKernelT    downsweep_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+        DownsweepKernelT    alt_downsweep_kernel)   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)upsweep_kernel;
+        (void)alt_upsweep_kernel;
+        (void)scan_kernel;
+        (void)downsweep_kernel;
+        (void)alt_downsweep_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Init regular and alternate-digit kernel configurations
+            PassConfig<UpsweepKernelT, ScanKernelT, DownsweepKernelT> pass_config, alt_pass_config;
+            if ((error = pass_config.template InitPassConfig<
+                    typename ActivePolicyT::UpsweepPolicy, 
+                    typename ActivePolicyT::ScanPolicy, 
+                    typename ActivePolicyT::DownsweepPolicy>(
+                upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break;
+
+            if ((error = alt_pass_config.template InitPassConfig<
+                    typename ActivePolicyT::AltUpsweepPolicy, 
+                    typename ActivePolicyT::ScanPolicy, 
+                    typename ActivePolicyT::AltDownsweepPolicy>(
+                alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;
+
+            // Get maximum spine length
+            int max_grid_size       = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
+            int spine_length        = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
+
+            // Temporary storage allocation requirements
+            void* allocations[3];
+            size_t allocation_sizes[3] =
+            {
+                spine_length * sizeof(OffsetT),                                         // bytes needed for privatized block digit histograms
+                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                     // bytes needed for 3rd keys buffer
+                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),    // bytes needed for 3rd values buffer
+            };
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+                return cudaSuccess;
+
+            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
+            int num_bits            = end_bit - begin_bit;
+            int num_passes          = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
+            bool is_num_passes_odd  = num_passes & 1;
+            int max_alt_passes      = (num_passes * pass_config.radix_bits) - num_bits;
+            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
+
+            // Alias the temporary storage allocations
+            OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);
+
+            DoubleBuffer<KeyT> d_keys_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
+                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());
+
+            DoubleBuffer<ValueT> d_values_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
+                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());
+
+            // Run first pass, consuming from the input's current buffers
+            int current_bit = begin_bit;
+            if (CubDebug(error = InvokePass(
+                d_keys.Current(), d_keys_remaining_passes.Current(),
+                d_values.Current(), d_values_remaining_passes.Current(),
+                d_spine, spine_length, current_bit,
+                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+            // Run remaining passes
+            while (current_bit < end_bit)
+            {
+                if (CubDebug(error = InvokePass(
+                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_spine, spine_length, current_bit,
+                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;;
+
+                // Invert selectors
+                d_keys_remaining_passes.selector ^= 1;
+                d_values_remaining_passes.selector ^= 1;
+            }
+
+            // Update selector
+            if (!is_overwrite_okay) {
+                num_passes = 1; // Sorted data always ends up in the other vector
+            }
+
+            d_keys.selector = (d_keys.selector + num_passes) & 1;
+            d_values.selector = (d_values.selector + num_passes) & 1;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation: launches either the single-tile sort or the multi-pass sort, parameterized by ActivePolicyT
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename ActivePolicyT::SingleTilePolicy    TilePolicyT;
+        typedef typename DispatchRadixSort::MaxPolicy       MaxPolicyT;
+
+        // Kernels are instantiated with MaxPolicyT, while ActivePolicyT parameterizes the host-side Invoke* helpers
+
+        // Force kernel code-generation in all compiler passes
+        if (num_items > (TilePolicyT::BLOCK_THREADS * TilePolicyT::ITEMS_PER_THREAD))
+        {
+            // Regular size: more items than one tile of the single-tile policy
+            return InvokePasses<ActivePolicyT>(
+                DeviceRadixSortUpsweepKernel<   MaxPolicyT, false,   IS_DESCENDING, KeyT, OffsetT>,
+                DeviceRadixSortUpsweepKernel<   MaxPolicyT, true,    IS_DESCENDING, KeyT, OffsetT>,
+                RadixSortScanBinsKernel<        MaxPolicyT, OffsetT>,
+                DeviceRadixSortDownsweepKernel< MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetT>,
+                DeviceRadixSortDownsweepKernel< MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetT>);
+        }
+
+        // Small, single tile size
+        return InvokeSingleTile<ActivePolicyT>(
+            DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine.  Queries the PTX version, bundles the arguments
+     * into a DispatchRadixSort functor, and passes that functor to the chained
+     * policy's Invoke.  Returns the first error encountered.
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        OffsetT                 num_items,              ///< [in] Number of items to sort
+        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
+        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
+
+        // Get PTX version; bail out immediately if the query fails
+        int ptx_version;
+        cudaError_t status = PtxVersion(ptx_version);
+        if (CubDebug(status))
+            return status;
+
+        // Create dispatch functor
+        DispatchRadixSort dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_keys, d_values,
+            num_items, begin_bit, end_bit, is_overwrite_okay,
+            stream, debug_synchronous, ptx_version);
+
+        // Dispatch to chained policy and report its result
+        status = MaxPolicyT::Invoke(ptx_version, dispatch);
+        return CubDebug(status);
+    }
+};
+
+
+
+
+/******************************************************************************
+ * Segmented dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort
+ */
+template <
+    bool     IS_DESCENDING,     ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,              ///< Key type
+    typename ValueT,            ///< Value type
+    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
+    typename OffsetT>           ///< Signed integer type for global offsets
+struct DispatchSegmentedRadixSort :
+    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    enum
+    {
+        // Whether this is a keys-only (or key-value) sort
+        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
+    };
+
+
+    //------------------------------------------------------------------------------
+    // Parameter members
+    //------------------------------------------------------------------------------
+
+    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+    OffsetT                 num_items;              ///< [in] Number of items to sort
+    OffsetT                 num_segments;           ///< [in] The number of segments that comprise the sorting data
+    OffsetIteratorT         d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+    OffsetIteratorT         d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
+    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                     ptx_version;            ///< [in] PTX version
+    bool                    is_overwrite_okay;      ///< [in] Whether it is okay to overwrite source buffers
+
+
+    //------------------------------------------------------------------------------
+    // Constructors
+    //------------------------------------------------------------------------------
+
+    /// Constructor.  Stores every argument in the like-named member (references are bound, not copied).
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchSegmentedRadixSort(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        DoubleBuffer<KeyT>      &d_keys,
+        DoubleBuffer<ValueT>    &d_values,
+        OffsetT                 num_items,
+        OffsetT                 num_segments,
+        OffsetIteratorT         d_begin_offsets,
+        OffsetIteratorT         d_end_offsets,
+        int                     begin_bit,
+        int                     end_bit,
+        bool                    is_overwrite_okay,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        // Initializers are listed in member declaration order (the order in which
+        // they actually run), which also keeps -Wreorder quiet
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_keys(d_keys),
+        d_values(d_values),
+        num_items(num_items),
+        num_segments(num_segments),
+        d_begin_offsets(d_begin_offsets),
+        d_end_offsets(d_end_offsets),
+        begin_bit(begin_bit),
+        end_bit(end_bit),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version),
+        is_overwrite_okay(is_overwrite_okay)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Multi-segment invocation
+    //------------------------------------------------------------------------------
+
+    /// Invoke a single-kernel segmented sorting pass at the current bit.  Launches one thread block per segment and, on success, advances \p current_bit by the number of bits consumed.
+    template <typename PassConfigT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePass(
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        int             &current_bit,
+        PassConfigT     &pass_config)
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // The final pass may cover fewer bits than the configured digit size
+            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+
+            // Log kernel configuration (num_segments is narrowed to int to match its %d specifier, since OffsetT may be wider than int)
+            if (debug_synchronous)
+                _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+                    (int) num_segments, pass_config.segmented_config.block_threads, (long long) stream,
+                    pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);
+
+            // NOTE(review): num_segments is used directly as the grid size; a zero-segment problem presumably fails this launch -- confirm callers guard against it
+            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
+                d_keys_in, d_keys_out,
+                d_values_in,  d_values_out,
+                d_begin_offsets, d_end_offsets, num_segments,
+                current_bit, pass_bits);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Update current bit
+            current_bit += pass_bits;
+        }
+        while (0);
+
+        return error;
+    }
+
+
+    /// PassConfig data structure: the kernel to launch for a pass, its launch configuration, and its digit size
+    template <typename SegmentedKernelT>
+    struct PassConfig
+    {
+        SegmentedKernelT    segmented_kernel;
+        KernelConfig        segmented_config;
+        int                 radix_bits;
+        int                 radix_digits;
+
+        /// Initialize pass configuration
+        template <typename SegmentedPolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
+        {
+            this->segmented_kernel  = segmented_kernel;
+            this->radix_bits        = SegmentedPolicyT::RADIX_BITS;
+            this->radix_digits      = 1 << radix_bits;
+
+            return CubDebug(segmented_config.Init<SegmentedPolicyT>(segmented_kernel));
+        }
+    };
+
+
+    /// Invocation (run multiple digit passes)
+    template <
+        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename                SegmentedKernelT>       ///< Function type of cub::DeviceSegmentedRadixSortKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        SegmentedKernelT     segmented_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+        SegmentedKernelT     alt_segmented_kernel)      ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+      (void)segmented_kernel;
+      (void)alt_segmented_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Init regular and alternate kernel configurations
+            PassConfig<SegmentedKernelT> pass_config, alt_pass_config;
+            if ((error = pass_config.template       InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;
+            if ((error = alt_pass_config.template   InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;
+
+            // Temporary storage allocation requirements
+            void* allocations[2];
+            size_t allocation_sizes[2] =
+            {
+                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                      // bytes needed for 3rd keys buffer
+                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),     // bytes needed for 3rd values buffer
+            };
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                // Never report a zero-byte requirement
+                if (temp_storage_bytes == 0)
+                    temp_storage_bytes = 1;
+                return cudaSuccess;
+            }
+
+            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
+            int radix_bits          = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
+            int alt_radix_bits      = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
+            int num_bits            = end_bit - begin_bit;
+            int num_passes          = (num_bits + radix_bits - 1) / radix_bits;
+            bool is_num_passes_odd  = num_passes & 1;
+            int max_alt_passes      = (num_passes * radix_bits) - num_bits;
+            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
+
+            // Ping-pong buffers for every pass after the first.  When overwriting is not
+            // allowed, the caller's current (input) buffer is never used as a destination.
+            DoubleBuffer<KeyT> d_keys_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
+                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0]) : d_keys.Alternate());
+
+            DoubleBuffer<ValueT> d_values_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),
+                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1]) : d_values.Alternate());
+
+            // Run first pass, consuming from the input's current buffers
+            int current_bit = begin_bit;
+
+            if (CubDebug(error = InvokePass(
+                d_keys.Current(), d_keys_remaining_passes.Current(),
+                d_values.Current(), d_values_remaining_passes.Current(),
+                current_bit,
+                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+            // Run remaining passes (InvokePass advances current_bit)
+            while (current_bit < end_bit)
+            {
+                // Each double-buffer is indexed by its own selector; the two selectors are flipped in lock-step below
+                if (CubDebug(error = InvokePass(
+                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],        d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_values_remaining_passes.d_buffers[d_values_remaining_passes.selector],    d_values_remaining_passes.d_buffers[d_values_remaining_passes.selector ^ 1],
+                    current_bit,
+                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+                // Invert selectors
+                d_keys_remaining_passes.selector ^= 1;
+                d_values_remaining_passes.selector ^= 1;
+            }
+
+            // Update selector
+            if (!is_overwrite_okay) {
+                num_passes = 1; // Sorted data always ends up in the other vector
+            }
+
+            d_keys.selector = (d_keys.selector + num_passes) & 1;
+            d_values.selector = (d_values.selector + num_passes) & 1;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        return InvokePasses<ActivePolicyT>(
+            DeviceSegmentedRadixSortKernel<MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>,
+            DeviceSegmentedRadixSortKernel<MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>);
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+
+    /// Internal dispatch routine.
+    /// NOTE(review): \p num_items and \p num_segments are declared \p int here although the corresponding members are \p OffsetT -- confirm that narrowing is intended for 64-bit OffsetT.
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                     num_items,              ///< [in] Number of items to sort
+        int                     num_segments,           ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT         d_begin_offsets,        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+        OffsetIteratorT         d_end_offsets,          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
+        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+        bool                    is_overwrite_okay,      ///< [in] Whether it is okay to overwrite source buffers
+        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
+
+        cudaError_t error;
+        do {
+            // Get PTX version
+            int ptx_version;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchSegmentedRadixSort dispatch(
+                d_temp_storage, temp_storage_bytes,
+                d_keys, d_values,
+                num_items, num_segments, d_begin_offsets, d_end_offsets,
+                begin_bit, end_bit, is_overwrite_okay,
+                stream, debug_synchronous, ptx_version);
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+
+        } while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/dispatch/dispatch_reduce.cuh b/external/cub/cub/device/dispatch/dispatch_reduce.cuh
new file mode 100644
index 00000000000..b6aa44cc0e5
--- /dev/null
+++ b/external/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -0,0 +1,882 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/agent_reduce.cuh"
+#include "../../iterator/arg_index_input_iterator.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../iterator/arg_index_input_iterator.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Multi-block reduction kernel entry point.  Each thread block reduces the tiles described by \p even_share, and thread 0 of the block writes that block's partial aggregate to <tt>d_out[blockIdx.x]</tt>.
+ */
+template <
+    typename                ChainedPolicyT,             ///< Chained tuning policy
+    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
+    typename                OffsetT,                    ///< Signed integer type for global offsets
+    typename                ReductionOpT>               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
+__global__ void DeviceReduceKernel(
+    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
+    OffsetT                 num_items,                  ///< [in] Total number of input data items
+    GridEvenShare<OffsetT>  even_share,                 ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+    ReductionOpT            reduction_op)               ///< [in] Binary reduction functor
+{
+    // Value types advertised by the two iterators
+    typedef typename std::iterator_traits<InputIteratorT>::value_type   InputValueT;
+    typedef typename std::iterator_traits<OutputIteratorT>::value_type  OutputValueT;
+
+    // Aggregate type: the output iterator's value type, unless that is void, in which case the input iterator's value type
+    typedef typename If<(Equals<OutputValueT, void>::VALUE), InputValueT, OutputValueT>::Type AggregateT;
+
+    // Block-level agent that performs the tile reduction
+    typedef AgentReduce<
+            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            OffsetT,
+            ReductionOpT>
+        BlockAgentT;
+
+    // Shared memory scratch space for the agent
+    __shared__ typename BlockAgentT::TempStorage agent_storage;
+
+    // Reduce this block's share of the input
+    BlockAgentT agent(agent_storage, d_in, reduction_op);
+    AggregateT  block_partial = agent.ConsumeTiles(even_share);
+
+    // One partial result per block, written by the first thread
+    if (threadIdx.x == 0)
+    {
+        d_out[blockIdx.x] = block_partial;
+    }
+}
+
+
+/**
+ * Single-block reduction kernel entry point.  Reduces items <tt>[0, num_items)</tt> of \p d_in, folds in \p init, and writes the result to <tt>*d_out</tt>.  Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass.
+ */
+template <
+    typename                ChainedPolicyT,             ///< Chained tuning policy
+    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
+    typename                OffsetT,                    ///< Signed integer type for global offsets
+    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename                OutputT>                    ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
+__global__ void DeviceReduceSingleTileKernel(
+    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
+    OffsetT                 num_items,                  ///< [in] Total number of input data items
+    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor
+    OutputT                 init)                       ///< [in] The initial value of the reduction
+{
+    // Block-level agent that performs the tile reduction
+    typedef AgentReduce<
+            typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            OffsetT,
+            ReductionOpT>
+        BlockAgentT;
+
+    // Shared memory scratch space for the agent
+    __shared__ typename BlockAgentT::TempStorage agent_storage;
+
+    const bool is_writer = (threadIdx.x == 0);
+
+    // Nothing to reduce: the result is just the initial value
+    if (num_items == 0)
+    {
+        if (is_writer)
+        {
+            *d_out = init;
+        }
+        return;
+    }
+
+    // Reduce the whole input range with this one block
+    BlockAgentT agent(agent_storage, d_in, reduction_op);
+    OutputT     tile_aggregate = agent.ConsumeRange(OffsetT(0), num_items);
+
+    // Combine with the initial value and store
+    if (is_writer)
+    {
+        *d_out = reduction_op(init, tile_aggregate);
+    }
+}
+
+
+/// Adjust a reduction result relative to its segment's base offset (generic overload: intentionally a no-op)
+template <typename T, typename OffsetT, typename IteratorT>
+__device__ __forceinline__
+void NormalizeReductionOutput(T &, OffsetT, IteratorT)
+{
+    // Nothing to adjust in this overload
+}
+
+
+/// Adjust a reduction result relative to its segment's base offset (overload for ArgIndexInputIterator inputs): subtracts \p base_offset from the result's key
+template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
+__device__ __forceinline__
+void NormalizeReductionOutput(
+    KeyValuePairT                                                   &val,
+    OffsetT                                                         base_offset,
+    ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT>)
+{
+    // The iterator argument only selects this overload; its value is unused
+    val.key -= base_offset;
+}
+
+
+/**
+ * Segmented reduction (one block per segment).  Block <em>i</em> reduces the items in <tt>[d_begin_offsets[i], d_end_offsets[i])</tt> of \p d_in, folds in \p init, and thread 0 writes the result to <tt>d_out[i]</tt>.  Segments whose end offset does not exceed their begin offset are treated as empty and yield \p init.
+ */
+template <
+    typename                ChainedPolicyT,             ///< Chained tuning policy
+    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
+    typename                OffsetIteratorT,            ///< Random-access input iterator type for reading segment offsets \iterator
+    typename                OffsetT,                    ///< Signed integer type for global offsets
+    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename                OutputT>                    ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
+__global__ void DeviceSegmentedReduceKernel(
+    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregates (one per segment)
+    OffsetIteratorT         d_begin_offsets,            ///< [in] Pointer to the sequence of beginning offsets, one per segment, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>
+    OffsetIteratorT         d_end_offsets,              ///< [in] Pointer to the sequence of ending offsets, one per segment, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>.  If <tt>d_end_offsets[i]</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+    int                     /*num_segments*/,           ///< [in] The number of segments that comprise the reduction data (unused by the kernel body)
+    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor
+    OutputT                 init)                       ///< [in] The initial value of the reduction
+{
+    // Thread block type for reducing input tiles
+    typedef AgentReduce<
+            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            OffsetT,
+            ReductionOpT>
+        AgentReduceT;
+
+    // Shared memory storage
+    __shared__ typename AgentReduceT::TempStorage temp_storage;
+
+    // Each block handles the segment selected by its block index
+    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
+    OffsetT segment_end     = d_end_offsets[blockIdx.x];
+
+    // Check if empty problem.  Inverted offsets (end < begin) are also treated as
+    // empty rather than being handed to ConsumeRange as a negative-length range.
+    // The condition is uniform across the block, so all threads return together.
+    if (segment_end <= segment_begin)
+    {
+        if (threadIdx.x == 0)
+            d_out[blockIdx.x] = init;
+        return;
+    }
+
+    // Consume input tiles
+    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+        segment_begin,
+        segment_end);
+
+    // Normalize as needed (rebases the key for arg-index inputs; no-op otherwise)
+    NormalizeReductionOutput(block_aggregate, segment_begin, d_in);
+
+    // Combine with the initial value and store this segment's result
+    if (threadIdx.x == 0)
+        d_out[blockIdx.x] = reduction_op(init, block_aggregate);
+}
+
+
+
+
+/******************************************************************************
+ * Policy
+ ******************************************************************************/
+
+template <
+    typename OutputT,           ///< Data type being reduced
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct DeviceReducePolicy
+{
+    //------------------------------------------------------------------------------
+    // Per-architecture tunings (every PolicyNNN aliases all three kernel policies)
+    //------------------------------------------------------------------------------
+
+    /// Tuning for SM13
+    struct Policy130 : ChainedPolicy<130, Policy130, Policy130>
+    {
+        // Tuning shared by all reduction kernels on this architecture
+        typedef AgentReducePolicy<
+                CUB_NOMINAL_CONFIG(128, 8, OutputT),    // threads per block, items per thread
+                2,                                      // items per vectorized load
+                BLOCK_REDUCE_RAKING,                    // cooperative block-wide reduction algorithm
+                LOAD_DEFAULT>                           // cache load modifier
+            ReducePolicy;
+
+        // Single-tile kernel: same tuning
+        typedef ReducePolicy SingleTilePolicy;
+
+        // Segmented kernel: same tuning
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// Tuning for SM20
+    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
+    {
+        // Measured on GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items
+        typedef AgentReducePolicy<
+                CUB_NOMINAL_CONFIG(128, 8, OutputT),    // threads per block, items per thread
+                4,                                      // items per vectorized load
+                BLOCK_REDUCE_RAKING,                    // cooperative block-wide reduction algorithm
+                LOAD_DEFAULT>                           // cache load modifier
+            ReducePolicy;
+
+        // Single-tile kernel: same tuning
+        typedef ReducePolicy SingleTilePolicy;
+
+        // Segmented kernel: same tuning
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// Tuning for SM30
+    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
+    {
+        // Measured on GTX670: 154.0 @ 48M 4B items
+        typedef AgentReducePolicy<
+                CUB_NOMINAL_CONFIG(256, 20, OutputT),   // threads per block, items per thread
+                2,                                      // items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,           // cooperative block-wide reduction algorithm
+                LOAD_DEFAULT>                           // cache load modifier
+            ReducePolicy;
+
+        // Single-tile kernel: same tuning
+        typedef ReducePolicy SingleTilePolicy;
+
+        // Segmented kernel: same tuning
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// Tuning for SM35
+    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
+    {
+        // Measured on GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items
+        typedef AgentReducePolicy<
+                CUB_NOMINAL_CONFIG(256, 20, OutputT),   // threads per block, items per thread
+                4,                                      // items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,           // cooperative block-wide reduction algorithm
+                LOAD_LDG>                               // cache load modifier
+            ReducePolicy;
+
+        // Single-tile kernel: same tuning
+        typedef ReducePolicy SingleTilePolicy;
+
+        // Segmented kernel: same tuning
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+    /// Tuning for SM60
+    struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
+    {
+        // Measured on P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items
+        typedef AgentReducePolicy<
+                CUB_NOMINAL_CONFIG(256, 16, OutputT),   // threads per block, items per thread
+                4,                                      // items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,           // cooperative block-wide reduction algorithm
+                LOAD_LDG>                               // cache load modifier
+            ReducePolicy;
+
+        // Single-tile kernel: same tuning
+        typedef ReducePolicy SingleTilePolicy;
+
+        // Segmented kernel: same tuning
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// Head of the policy chain: the newest architecture tuned above
+    typedef Policy600 MaxPolicy;
+
+};
+
+
+
+/******************************************************************************
+ * Single-problem dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
+ */
+template <
+    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
+struct DispatchReduce :
+    DeviceReducePolicy<
+        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,                          // ... else the output iterator's value type
+        OffsetT,
+        ReductionOpT>
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    // Data type of output iterator
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+
+    //------------------------------------------------------------------------------
+    // Problem state
+    //------------------------------------------------------------------------------
+
+    void                *d_temp_storage;                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t              &temp_storage_bytes;            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    InputIteratorT      d_in;                           ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT     d_out;                          ///< [out] Pointer to the output aggregate
+    OffsetT             num_items;                      ///< [in] Total number of input items (i.e., length of \p d_in)
+    ReductionOpT        reduction_op;                   ///< [in] Binary reduction functor 
+    OutputT             init;                           ///< [in] The initial value of the reduction
+    cudaStream_t        stream;                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                debug_synchronous;              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                 ptx_version;                    ///< [in] PTX version
+
+    //------------------------------------------------------------------------------
+    // Constructor
+    //------------------------------------------------------------------------------
+
+    /// Constructor.  Records the problem description member-by-member; the size
+    /// argument \p temp_storage_bytes is bound by reference rather than copied.
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchReduce(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        InputIteratorT          d_in,
+        OutputIteratorT         d_out,
+        OffsetT                 num_items,
+        ReductionOpT            reduction_op,
+        OutputT                 init,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+        : d_temp_storage(d_temp_storage)
+        , temp_storage_bytes(temp_storage_bytes)
+        , d_in(d_in)
+        , d_out(d_out)
+        , num_items(num_items)
+        , reduction_op(reduction_op)
+        , init(init)
+        , stream(stream)
+        , debug_synchronous(debug_synchronous)
+        , ptx_version(ptx_version)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Small-problem (single tile) invocation
+    //------------------------------------------------------------------------------
+
+    /// Invoke a single thread block to reduce the whole input in-core
+    template <
+        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
+        typename                SingleTileKernelT>      ///< Function type of cub::DeviceReduceSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokeSingleTile(
+        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        // Kernel launch not supported from this device
+        (void)single_tile_kernel;
+        return CubDebug(cudaErrorNotSupported );
+#else
+        typedef typename ActivePolicyT::SingleTilePolicy TilePolicyT;
+
+        // Size query: the launch below takes no temporary storage, so a size of 1 is reported
+        if (d_temp_storage == NULL)
+        {
+            temp_storage_bytes = 1;
+            return cudaSuccess;
+        }
+
+        // Print the launch configuration when debugging
+        if (debug_synchronous)
+        {
+            _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
+                TilePolicyT::BLOCK_THREADS,
+                (long long) stream,
+                TilePolicyT::ITEMS_PER_THREAD);
+        }
+
+        // Launch exactly one block on the caller's stream
+        single_tile_kernel<<<1, TilePolicyT::BLOCK_THREADS, 0, stream>>>(
+            d_in,
+            d_out,
+            num_items,
+            reduction_op,
+            init);
+
+        // Check for failure to launch
+        cudaError error = cudaSuccess;
+        if (CubDebug(error = cudaPeekAtLastError())) return error;
+
+        // Sync the stream if specified to flush runtime errors
+        if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) return error;
+
+        // No step above reported a failure
+        return error;
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Normal problem size invocation (two-pass)
+    //------------------------------------------------------------------------------
+
+    /**
+     * Reduce in two passes: \p reduce_kernel writes per-block partial reductions into
+     * temporary storage, then \p single_tile_kernel reduces those partials into \p d_out.
+     * When \p d_temp_storage is NULL only the required storage size is computed.
+     */
+    template <
+        typename                ActivePolicyT,              ///< Umbrella policy active for the target device
+        typename                ReduceKernelT,              ///< Function type of cub::DeviceReduceKernel
+        typename                SingleTileKernelT>          ///< Function type of cub::DeviceReduceSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        ReduceKernelT           reduce_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
+        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        // Kernel launch not supported from this device
+        (void)                  reduce_kernel;
+        (void)                  single_tile_kernel;
+        return CubDebug(cudaErrorNotSupported );
+#else
+        typedef typename ActivePolicyT::ReducePolicy        ReducePolicyT;
+        typedef typename ActivePolicyT::SingleTilePolicy    TilePolicyT;
+
+        cudaError error = cudaSuccess;
+
+        // Identify the current device
+        int device_ordinal;
+        if (CubDebug(error = cudaGetDevice(&device_ordinal))) return error;
+
+        // Number of multiprocessors on that device
+        int sm_count;
+        if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) return error;
+
+        // Occupancy of the first-pass kernel: per SM, then scaled to the whole device
+        KernelConfig reduce_config;
+        if (CubDebug(error = reduce_config.Init<ReducePolicyT>(reduce_kernel))) return error;
+        int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;
+
+        // Upper bound on the grid, and the even-share split of the input across it
+        int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
+        GridEvenShare<OffsetT> even_share;
+        even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);
+
+        // Temporary storage: room for one privatized block reduction per potential block
+        void*  allocations[1];
+        size_t allocation_sizes[1] = { max_blocks * sizeof(OutputT) };
+
+        // Alias the allocation from the single storage blob (or compute the necessary size of the blob)
+        if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) return error;
+
+        // Size query: the caller only wanted the storage size, so nothing is launched
+        if (d_temp_storage == NULL)
+            return cudaSuccess;
+
+        // View allocations[0] as the array of per-block partial reductions
+        OutputT *d_block_reductions = (OutputT*) allocations[0];
+
+        // Grid size selected by the even-share distribution
+        int reduce_grid_size = even_share.grid_size;
+
+        // ---- Pass 1: DeviceReduceKernel produces the per-block partials ----
+        if (debug_synchronous)
+        {
+            _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                reduce_grid_size,
+                ReducePolicyT::BLOCK_THREADS,
+                (long long) stream,
+                ReducePolicyT::ITEMS_PER_THREAD,
+                reduce_config.sm_occupancy);
+        }
+
+        reduce_kernel<<<reduce_grid_size, ReducePolicyT::BLOCK_THREADS, 0, stream>>>(
+            d_in,
+            d_block_reductions,
+            num_items,
+            even_share,
+            reduction_op);
+
+        // Check for failure to launch
+        if (CubDebug(error = cudaPeekAtLastError())) return error;
+
+        // Sync the stream if specified to flush runtime errors
+        if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) return error;
+
+        // ---- Pass 2: DeviceReduceSingleTileKernel reduces the partials into d_out ----
+        if (debug_synchronous)
+        {
+            _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
+                TilePolicyT::BLOCK_THREADS,
+                (long long) stream,
+                TilePolicyT::ITEMS_PER_THREAD);
+        }
+
+        single_tile_kernel<<<1, TilePolicyT::BLOCK_THREADS, 0, stream>>>(
+            d_block_reductions,
+            d_out,
+            reduce_grid_size,
+            reduction_op,
+            init);
+
+        // Check for failure to launch
+        if (CubDebug(error = cudaPeekAtLastError())) return error;
+
+        // Sync the stream if specified to flush runtime errors
+        if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) return error;
+
+        return error;
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Entry point for the chained-policy dispatch: picks the single-tile or the two-pass path for \p ActivePolicyT
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;
+        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
+
+        // Number of items one tile can hold.  The kernels in both branches are named with
+        // MaxPolicyT, which forces kernel code-generation in all compiler passes.
+        const int single_tile_items = SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD;
+        if (num_items > single_tile_items)
+        {
+            // Regular size
+            return InvokePasses<ActivePolicyT>(
+                DeviceReduceKernel<MaxPolicyT, InputIteratorT, OutputT*, OffsetT, ReductionOpT>,
+                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
+        }
+
+        // Small, single tile size
+        return InvokeSingleTile<ActivePolicyT>(
+            DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction.  Looks up the PTX
+     * version, packs the arguments into a DispatchReduce functor and passes that functor
+     * to the policy chain rooted at MaxPolicy.  The first failing step's error is returned.
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregate
+        OffsetT         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
+        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor
+        OutputT         init,                               ///< [in] The initial value of the reduction
+        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchReduce::MaxPolicy MaxPolicyT;
+
+        // Get PTX version
+        int ptx_version;
+        cudaError error = cudaSuccess;
+        if (CubDebug(error = PtxVersion(ptx_version))) return error;
+
+        // Bundle the problem description, together with the PTX version just
+        // obtained, into the dispatch functor
+        DispatchReduce dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_in, d_out, num_items, reduction_op, init,
+            stream, debug_synchronous, ptx_version);
+
+        // Dispatch to chained policy
+        if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) return error;
+
+        // No step above reported a failure
+        return error;
+    }
+};
+
+
+
+/******************************************************************************
+ * Segmented dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide segmented reduction
+ */
+template <
+    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
+    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
+struct DispatchSegmentedReduce :
+    DeviceReducePolicy<
+        typename std::iterator_traits<InputIteratorT>::value_type,
+        OffsetT,
+        ReductionOpT>
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    /// The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+
+    //------------------------------------------------------------------------------
+    // Problem state
+    //------------------------------------------------------------------------------
+
+    void                *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t              &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    InputIteratorT      d_in;                   ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT     d_out;                  ///< [out] Pointer to the output aggregate
+    OffsetT             num_segments;           ///< [in] The number of segments that comprise the sorting data
+    OffsetIteratorT     d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
+    OffsetIteratorT     d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
+    ReductionOpT        reduction_op;           ///< [in] Binary reduction functor 
+    OutputT             init;                   ///< [in] The initial value of the reduction
+    cudaStream_t        stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+    bool                debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int                 ptx_version;            ///< [in] PTX version
+
+    //------------------------------------------------------------------------------
+    // Constructor
+    //------------------------------------------------------------------------------
+
+    /// Constructor.  Records the problem description member-by-member; the size
+    /// argument \p temp_storage_bytes is bound by reference rather than copied.
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchSegmentedReduce(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        InputIteratorT          d_in,
+        OutputIteratorT         d_out,
+        OffsetT                 num_segments,
+        OffsetIteratorT         d_begin_offsets,
+        OffsetIteratorT         d_end_offsets,
+        ReductionOpT            reduction_op,
+        OutputT                 init,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+        : d_temp_storage(d_temp_storage)
+        , temp_storage_bytes(temp_storage_bytes)
+        , d_in(d_in)
+        , d_out(d_out)
+        , num_segments(num_segments)
+        , d_begin_offsets(d_begin_offsets)
+        , d_end_offsets(d_end_offsets)
+        , reduction_op(reduction_op)
+        , init(init)
+        , stream(stream)
+        , debug_synchronous(debug_synchronous)
+        , ptx_version(ptx_version)
+    {}
+
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Launch the segmented reduction kernel with one thread block per segment (the grid size is \p num_segments)
+    template <
+        typename                        ActivePolicyT,                  ///< Umbrella policy active for the target device
+        typename                        DeviceSegmentedReduceKernelT>   ///< Function type of cub::DeviceSegmentedReduceKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        DeviceSegmentedReduceKernelT    segmented_reduce_kernel)        ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)segmented_reduce_kernel;
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Size query: the launch below takes no temporary storage, so a size of 1 is reported
+            if (d_temp_storage == NULL)
+            {
+                temp_storage_bytes = 1;
+                return cudaSuccess;
+            }
+
+            // Init kernel configuration
+            KernelConfig segmented_reduce_config;
+            if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;
+
+            // Log DeviceSegmentedReduceKernel configuration.  num_segments is cast because its
+            // type is OffsetT, which need not match the width expected by the %d conversion
+            if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                (int) num_segments,
+                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
+                segmented_reduce_config.sm_occupancy);
+
+            // Invoke DeviceSegmentedReduceKernel, one block per segment
+            segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(
+                d_in,
+                d_out,
+                d_begin_offsets,
+                d_end_offsets,
+                num_segments,
+                reduction_op,
+                init);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /// Entry point for the chained-policy dispatch: launches the segmented kernel using the tuning of \p ActivePolicyT
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        // The kernel is named with MaxPolicy to force kernel code-generation in all compiler passes
+        return InvokePasses<ActivePolicyT>(
+            DeviceSegmentedReduceKernel<
+                typename DispatchSegmentedReduce::MaxPolicy,
+                InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOpT, OutputT>);
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide segmented reduction
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregates (one per segment)
+        int             num_segments,                       ///< [in] The number of segments that comprise the reduction data
+        OffsetIteratorT d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>
+        OffsetIteratorT d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_in</tt>.  If <tt>d_end_offsets[i]</tt> equals <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> segment is considered empty.
+        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor
+        OutputT         init,                               ///< [in] The initial value of the reduction
+        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;
+
+        // No segments: nothing to launch.  A size query must still be answered, otherwise the caller's size stays unset
+        if (num_segments <= 0)
+        {
+            if (d_temp_storage == NULL)
+                temp_storage_bytes = 1;
+            return cudaSuccess;
+        }
+
+        // Get PTX version
+        int ptx_version;
+        cudaError error = cudaSuccess;
+        if (CubDebug(error = PtxVersion(ptx_version))) return error;
+
+        // Create dispatch functor
+        DispatchSegmentedReduce dispatch(
+            d_temp_storage, temp_storage_bytes,
+            d_in, d_out,
+            num_segments, d_begin_offsets, d_end_offsets,
+            reduction_op, init,
+            stream, debug_synchronous, ptx_version);
+
+        // Dispatch to chained policy
+        if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) return error;
+        return error;
+    }
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/external/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
new file mode 100644
index 00000000000..672bc49393a
--- /dev/null
+++ b/external/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -0,0 +1,554 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch_scan.cuh"
+#include "../../agent/agent_reduce_by_key.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Reduce-by-key sweep kernel entry point: each thread block binds an AgentReduceByKey to its shared scratch space and calls ConsumeRange() on it
+ */
+template <
+    typename            AgentReduceByKeyPolicyT,                ///< Tuning policy type forwarded to AgentReduceByKey (also supplies BLOCK_THREADS for the launch bounds)
+    typename            KeysInputIteratorT,                     ///< Random-access input iterator type for keys
+    typename            UniqueOutputIteratorT,                  ///< Random-access output iterator type for the unique keys
+    typename            ValuesInputIteratorT,                   ///< Random-access input iterator type for values
+    typename            AggregatesOutputIteratorT,              ///< Random-access output iterator type for the value aggregates
+    typename            NumRunsOutputIteratorT,                 ///< Output iterator type for recording the number of runs encountered
+    typename            ScanTileStateT,                         ///< Tile status interface type
+    typename            EqualityOpT,                            ///< KeyT equality operator type
+    typename            ReductionOpT,                           ///< ValueT reduction operator type
+    typename            OffsetT>                                ///< Signed integer type for global offsets
+__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS))
+__global__ void DeviceReduceByKeyKernel(
+    KeysInputIteratorT          d_keys_in,                      ///< Input sequence of keys
+    UniqueOutputIteratorT       d_unique_out,                   ///< Output sequence of unique keys (one key per run)
+    ValuesInputIteratorT        d_values_in,                    ///< Input sequence of values corresponding to the keys
+    AggregatesOutputIteratorT   d_aggregates_out,               ///< Output sequence of value aggregates (one aggregate per run)
+    NumRunsOutputIteratorT      d_num_runs_out,                 ///< Total number of runs encountered (i.e., the length of d_unique_out)
+    ScanTileStateT              tile_state,                     ///< Tile status interface
+    int                         start_tile,                     ///< The starting tile for the current grid
+    EqualityOpT                 equality_op,                    ///< KeyT equality operator
+    ReductionOpT                reduction_op,                   ///< ValueT reduction operator
+    OffsetT                     num_items)                      ///< Total number of input items
+{
+    // Block-wide agent type, specialized on this kernel's policy, iterator, operator and offset types
+    typedef AgentReduceByKey<
+            AgentReduceByKeyPolicyT,
+            KeysInputIteratorT, UniqueOutputIteratorT,
+            ValuesInputIteratorT, AggregatesOutputIteratorT,
+            NumRunsOutputIteratorT,
+            EqualityOpT, ReductionOpT,
+            OffsetT>
+        AgentT;
+
+    // Scratch space required by the agent, declared in shared memory
+    __shared__ typename AgentT::TempStorage agent_storage;
+
+    // Bind the agent to its scratch space, iterators and operators
+    AgentT agent(
+        agent_storage,
+        d_keys_in, d_unique_out,
+        d_values_in, d_aggregates_out,
+        d_num_runs_out, equality_op, reduction_op);
+
+    agent.ConsumeRange(num_items, tile_state, start_tile);      // process tiles; start_tile is this grid's starting tile
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
+ */
+template <
+    typename    KeysInputIteratorT,         ///< Random-access input iterator type for keys
+    typename    UniqueOutputIteratorT,      ///< Random-access output iterator type for keys
+    typename    ValuesInputIteratorT,       ///< Random-access input iterator type for values
+    typename    AggregatesOutputIteratorT,  ///< Random-access output iterator type for values
+    typename    NumRunsOutputIteratorT,     ///< Output iterator type for recording number of segments encountered
+    typename    EqualityOpT,                ///< KeyT equality operator type
+    typename    ReductionOpT,               ///< ValueT reduction operator type
+    typename    OffsetT>                    ///< Signed integer type for global offsets
+struct DispatchReduceByKey
+{
+    //-------------------------------------------------------------------------
+    // Types and constants
+    //-------------------------------------------------------------------------
+
+    // Key type read through the keys input iterator
+    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
+
+    // Key type written to the unique-keys output: the output iterator's value_type, or KeyInputT when that value_type is void
+    typedef typename If<
+        (Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),
+        KeyInputT, typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;
+
+    // Value type read through the values input iterator
+    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
+
+    // Value type written to the aggregates output: the output iterator's value_type, or ValueInputT when that value_type is void
+    typedef typename If<
+        (Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),
+        ValueInputT, typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;
+
+    enum
+    {
+        INIT_KERNEL_THREADS     = 128,                                                      // block size used when launching the init kernel
+        MAX_INPUT_BYTES         = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)),        // larger of the two output element sizes
+        COMBINED_INPUT_BYTES    = sizeof(KeyOutputT) + sizeof(ValueOutputT),                // sum of the two output element sizes
+    };
+
+    // Tile status descriptor interface, parameterized on the aggregate value type and the offset type
+    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
+
+
+    //-------------------------------------------------------------------------
+    // Tuning policies
+    //-------------------------------------------------------------------------
+
+    /// SM35 tuning policy (chosen for PTX versions >= 350).  ITEMS_PER_THREAD is 6 when MAX_INPUT_BYTES <= 8, otherwise ceil(48 / COMBINED_INPUT_BYTES) clamped to [1, 6].
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 6,
+            ITEMS_PER_THREAD            = (MAX_INPUT_BYTES <= 8) ? 6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+        // Agent policy built from the constants above.  NOTE(review): the first argument is presumably the threads-per-block count — confirm against AgentReduceByKeyPolicy.
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM30 tuning policy (chosen for PTX versions in [300, 350)).  ITEMS_PER_THREAD is ceil(48 / COMBINED_INPUT_BYTES) clamped to [1, 6].
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 6,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+        // Agent policy built from the constants above.  NOTE(review): the first argument is presumably the threads-per-block count — confirm against AgentReduceByKeyPolicy.
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM20 tuning policy (chosen for PTX versions in [200, 300)).  ITEMS_PER_THREAD is ceil(88 / COMBINED_INPUT_BYTES) clamped to [1, 11].
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 11,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+        // Agent policy built from the constants above.  NOTE(review): the first argument is presumably the threads-per-block count — confirm against AgentReduceByKeyPolicy.
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM13 tuning policy (chosen for PTX versions in [130, 200)).  ITEMS_PER_THREAD is ceil(56 / COMBINED_INPUT_BYTES) clamped to [1, 7].
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 7,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
+        };
+        // Agent policy built from the constants above.  NOTE(review): the first argument is presumably the threads-per-block count — confirm against AgentReduceByKeyPolicy.
+        typedef AgentReduceByKeyPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            ReduceByKeyPolicyT;
+    };
+
+    /// SM11 tuning policy (fallback for PTX versions below 130).  ITEMS_PER_THREAD is floor(40 / COMBINED_INPUT_BYTES) clamped to [1, 5].
+    struct Policy110
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 5,
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
+        };
+        // NOTE(review): unlike the other policies in this struct, the division above does not round up — confirm this is intentional.
+        typedef AgentReduceByKeyPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING>
+            ReduceByKeyPolicyT;
+    };
+
+
+    /******************************************************************************
+     * Tuning policy of the current PTX compiler pass, selected by CUB_PTX_ARCH
+     ******************************************************************************/
+    // The chain tests CUB_PTX_ARCH from the highest threshold down; any value below 130 (including 0) falls through to Policy110.
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy110 PtxPolicy;
+
+#endif
+
+    // "Opaque" policy: a named struct derived from the selected policy, so the parameterization isn't reflected in the type signature of kernels instantiated with it
+    struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Fill \p reduce_by_key_config from the tuning policy that corresponds to the PTX assembly we will use
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig    &reduce_by_key_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+        (void)ptx_version;
+
+        // Device pass: the policy was already fixed at compile time, so ptx_version is not consulted
+        reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();
+
+    #else
+
+        // Host pass: walk the version thresholds in ascending order and take the policy of the bracket containing ptx_version
+        if (ptx_version < 130)
+        {
+            reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
+        }
+        else if (ptx_version < 200)
+        {
+            reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
+        }
+        else if (ptx_version < 300)
+        {
+            reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
+        }
+        else if (ptx_version < 350)
+        {
+            reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
+        }
+        else
+        {
+            reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration: launch parameters copied out of a tuning policy type.
+     */
+    struct KernelConfig
+    {
+        int block_threads;      // PolicyT::BLOCK_THREADS
+        int items_per_thread;   // PolicyT::ITEMS_PER_THREAD
+        int tile_items;         // product of the two fields above
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            this->block_threads     = int(PolicyT::BLOCK_THREADS);
+            this->items_per_thread  = int(PolicyT::ITEMS_PER_THREAD);
+            this->tile_items        = this->block_threads * this->items_per_thread;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduce-by-key using the specified kernel functions.
+     * Sizes and aliases the temporary storage, launches \p init_kernel, then launches \p reduce_by_key_kernel in one or more grid epochs.
+     */
+    template <
+        typename                    ScanInitKernelT,         ///< Function type of the tile-state initialization kernel
+        typename                    ReduceByKeyKernelT>      ///< Function type of cub::DeviceReduceByKeyKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeysInputIteratorT          d_keys_in,                  ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,               ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        ValuesInputIteratorT        d_values_in,                ///< [in] Pointer to the input sequence of corresponding values
+        AggregatesOutputIteratorT   d_aggregates_out,           ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
+        NumRunsOutputIteratorT      d_num_runs_out,             ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
+        EqualityOpT                 equality_op,                ///< [in] KeyT equality operator
+        ReductionOpT                reduction_op,               ///< [in] ValueT reduction operator
+        OffsetT                     num_items,                  ///< [in] Total number of input items
+        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within.
+        bool                        debug_synchronous,          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.
+        int                         /*ptx_version*/,            ///< [in] PTX version of dispatch kernels (unused by this overload)
+        ScanInitKernelT                init_kernel,                ///< [in] Kernel function pointer used to initialize the tile descriptors and \p d_num_runs_out
+        ReduceByKeyKernelT             reduce_by_key_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
+        KernelConfig                reduce_by_key_config)       ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+      (void)d_temp_storage;
+      (void)temp_storage_bytes;
+      (void)d_keys_in;
+      (void)d_unique_out;
+      (void)d_values_in;
+      (void)d_aggregates_out;
+      (void)d_num_runs_out;
+      (void)equality_op;
+      (void)reduction_op;
+      (void)num_items;
+      (void)stream;
+      (void)debug_synchronous;
+      (void)init_kernel;
+      (void)reduce_by_key_kernel;
+      (void)reduce_by_key_config;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count (not read again in this routine; the query is kept so its error path is unchanged)
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles: ceil(num_items / tile_size), computed as ((num_items - 1) / tile_size) + 1 so the rounding never forms (num_items + tile_size - 1), which can overflow OffsetT
+            int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
+            int num_tiles = (num_items > 0) ? int(((num_items - 1) / tile_size) + 1) : 0;
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_state;
+            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log init_kernel configuration (at least one block is launched, even when there are no tiles)
+            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors
+            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                tile_state,
+                num_tiles,
+                d_num_runs_out);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Return if empty problem (checked only after init_kernel has been launched)
+            if (num_items == 0)
+                break;
+
+            // Get SM occupancy for reduce_by_key_kernel (only reported in the debug log below)
+            int reduce_by_key_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                reduce_by_key_sm_occupancy,            // out
+                reduce_by_key_kernel,
+                reduce_by_key_config.block_threads))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;
+
+            // Run grids in epochs (in case number of tiles exceeds max x-dimension)
+            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
+            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
+            {
+                // Log reduce_by_key_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                    start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);
+
+                // Invoke reduce_by_key_kernel
+                reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
+                    d_keys_in,
+                    d_unique_out,
+                    d_values_in,
+                    d_aggregates_out,
+                    d_num_runs_out,
+                    tile_state,
+                    start_tile,
+                    equality_op,
+                    reduction_op,
+                    num_items);
+
+                // Check for failure to launch (break leaves the epoch loop; error is then returned below)
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine: resolves the PTX version and kernel configuration, then forwards to the kernel-parameterized Dispatch overload
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeysInputIteratorT          d_keys_in,                      ///< [in] Input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Output sequence of unique keys (one key per run)
+        ValuesInputIteratorT        d_values_in,                    ///< [in] Input sequence of corresponding values
+        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Output sequence of value aggregates (one aggregate per run)
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Total number of runs encountered (i.e., the length of d_unique_out)
+        EqualityOpT                 equality_op,                    ///< [in] KeyT equality operator
+        ReductionOpT                reduction_op,                   ///< [in] ValueT reduction operator
+        OffsetT                     num_items,                      ///< [in] Total number of input items
+        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.
+        bool                        debug_synchronous)              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.
+    {
+        cudaError status = cudaSuccess;
+
+        // Resolve the PTX version: queried through PtxVersion() when CUB_PTX_ARCH is 0,
+        // otherwise taken directly from CUB_PTX_ARCH
+        int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+        if (CubDebug(status = PtxVersion(ptx_version))) return status;
+    #else
+        ptx_version = CUB_PTX_ARCH;
+    #endif
+
+        // Fill in the kernel dispatch configuration for that PTX version
+        KernelConfig launch_config;
+        InitConfigs(ptx_version, launch_config);
+
+        // Forward to the kernel-parameterized overload with the init and reduce-by-key kernels
+        status = CubDebug(Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_unique_out,
+            d_values_in,
+            d_aggregates_out,
+            d_num_runs_out,
+            equality_op,
+            reduction_op,
+            num_items,
+            stream,
+            debug_synchronous,
+            ptx_version,
+            DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
+            DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
+            launch_config));
+
+        // Whatever the inner dispatch reported (success or failure) is handed back
+        // to the caller unchanged
+        return status;
+    }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/dispatch/dispatch_rle.cuh b/external/cub/cub/device/dispatch/dispatch_rle.cuh
new file mode 100644
index 00000000000..1de979e88cd
--- /dev/null
+++ b/external/cub/cub/device/dispatch/dispatch_rle.cuh
@@ -0,0 +1,538 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch_scan.cuh"
+#include "../../agent/agent_rle.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Run-length-encode sweep kernel entry point (multi-block)
+ *
+ * Each thread block constructs an AgentRle over the input and output
+ * iterators and calls its ConsumeRange() with the tile count, the tile
+ * status interface, and the run-count output iterator.
+ */
+template <
+    typename            AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type (its BLOCK_THREADS is used as the launch bound)
+    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values \iterator
+    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values \iterator
+    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered \iterator
+    typename            ScanTileStateT,              ///< Tile status interface type
+    typename            EqualityOpT,                 ///< Equality operator type for input items
+    typename            OffsetT>                    ///< Signed integer type for global offsets
+__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS))
+__global__ void DeviceRleSweepKernel(
+    InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
+    OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run-offsets
+    LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run-lengths
+    NumRunsOutputIteratorT      d_num_runs_out,     ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
+    ScanTileStateT              tile_status,        ///< [in] Tile status interface
+    EqualityOpT                 equality_op,        ///< [in] Equality operator for input items
+    OffsetT                     num_items,          ///< [in] Total number of input items (i.e., length of \p d_in)
+    int                         num_tiles)          ///< [in] Total number of tiles for the entire problem
+{
+    // Thread block type for run-length encoding input tiles
+    typedef AgentRle<
+        AgentRlePolicyT,
+        InputIteratorT,
+        OffsetsOutputIteratorT,
+        LengthsOutputIteratorT,
+        EqualityOpT,
+        OffsetT> AgentRleT;
+
+    // Shared memory for AgentRle
+    __shared__ typename AgentRleT::TempStorage temp_storage;
+
+    // Build a temporary agent bound to the shared storage and let it consume the tiles
+    AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange(
+        num_tiles,
+        tile_status,
+        d_num_runs_out);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceRle
+ */
+template <
+    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
+    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values \iterator
+    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values \iterator
+    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered \iterator
+    typename            EqualityOpT,                ///< T equality operator type
+    typename            OffsetT>                    ///< Signed integer type for global offsets
+struct DeviceRleDispatch
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
+
+    // The lengths output value type
+    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+        OffsetT,                                                                                                    // ... then the OffsetT type,
+        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128,
+    };
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35 tuning policy (selected when the PTX version is 350 or higher)
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 15,   // Items per thread when sizeof(T) is 4 bytes or less
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),   // Scaled down for wider T, never below 1
+        };
+
+        typedef AgentRlePolicy<
+                96,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                true,
+                BLOCK_SCAN_WARP_SCANS>
+            RleSweepPolicy;
+    };
+
+    /// SM30 tuning policy (selected when the PTX version is at least 300 but below 350)
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 5,   // Items per thread when sizeof(T) is 4 bytes or less
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),   // Scaled down for wider T, never below 1
+        };
+
+        typedef AgentRlePolicy<
+                256,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            RleSweepPolicy;
+    };
+
+    /// SM20 tuning policy (selected when the PTX version is at least 200 but below 300)
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 15,   // Items per thread when sizeof(T) is 4 bytes or less
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),   // Scaled down for wider T, never below 1
+        };
+
+        typedef AgentRlePolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            RleSweepPolicy;
+    };
+
+    /// SM13 tuning policy (selected when the PTX version is at least 130 but below 200)
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,   // Items per thread when sizeof(T) is 4 bytes or less
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),   // Scaled down for wider T, never below 1
+        };
+
+        typedef AgentRlePolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            RleSweepPolicy;
+    };
+
+    /// SM10 tuning policy (fallback, selected when the PTX version is below 130)
+    struct Policy100
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,   // Items per thread when sizeof(T) is 4 bytes or less
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),   // Scaled down for wider T, never below 1
+        };
+
+        typedef AgentRlePolicy<
+                256,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                true,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            RleSweepPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policy: a named type derived from the selected RleSweepPolicy, so its parameterization isn't reflected in the type signature of kernels instantiated with it
+    struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Fill \p device_rle_config from the sweep policy that matches the PTX assembly that will be used
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig&   device_rle_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // Device pass: the policy for this PTX target is already fixed at compile time, so use it directly
+        device_rle_config.template Init<PtxRleSweepPolicy>();
+
+    #else
+
+        // Host pass: choose the policy tier from the device's PTX version, testing each tier's upper bound from oldest to newest
+        if (ptx_version < 130)
+        {
+            device_rle_config.template Init<typename Policy100::RleSweepPolicy>();
+        }
+        else if (ptx_version < 200)
+        {
+            device_rle_config.template Init<typename Policy130::RleSweepPolicy>();
+        }
+        else if (ptx_version < 300)
+        {
+            device_rle_config.template Init<typename Policy200::RleSweepPolicy>();
+        }
+        else if (ptx_version < 350)
+        {
+            device_rle_config.template Init<typename Policy300::RleSweepPolicy>();
+        }
+        else
+        {
+            device_rle_config.template Init<typename Policy350::RleSweepPolicy>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration.  Mirrors the constants within AgentRlePolicyT.
+     */
+    struct KernelConfig
+    {
+        int                     block_threads;              // Copy of AgentRlePolicyT::BLOCK_THREADS
+        int                     items_per_thread;           // Copy of AgentRlePolicyT::ITEMS_PER_THREAD
+        BlockLoadAlgorithm      load_policy;                // Copy of AgentRlePolicyT::LOAD_ALGORITHM
+        bool                    store_warp_time_slicing;    // Copy of AgentRlePolicyT::STORE_WARP_TIME_SLICING
+        BlockScanAlgorithm      scan_algorithm;             // Copy of AgentRlePolicyT::SCAN_ALGORITHM
+
+        template <typename AgentRlePolicyT>                 // Policy type whose compile-time constants are copied
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads               = AgentRlePolicyT::BLOCK_THREADS;
+            items_per_thread            = AgentRlePolicyT::ITEMS_PER_THREAD;
+            load_policy                 = AgentRlePolicyT::LOAD_ALGORITHM;
+            store_warp_time_slicing     = AgentRlePolicyT::STORE_WARP_TIME_SLICING;
+            scan_algorithm              = AgentRlePolicyT::SCAN_ALGORITHM;
+        }
+
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Print()
+        {
+            printf("%d, %d, %d, %d, %d",                    // Five fields in declaration order, comma-separated, no trailing newline
+                block_threads,
+                items_per_thread,
+                load_policy,
+                store_warp_time_slicing,
+                scan_algorithm);
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide run-length-encode using the
+     * specified kernel functions.  Returns the first CUDA error encountered, else cudaSuccess.
+     */
+    template <
+        typename                    DeviceScanInitKernelPtr,        ///< Function type of the tile-status init kernel (launched here with tile_status, num_tiles, d_num_runs_out)
+        typename                    DeviceRleSweepKernelPtr>        ///< Function type of cub::DeviceRleSweepKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to the output sequence of run-offsets
+        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to the output sequence of run-lengths
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out)
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int                         ptx_version,                    ///< [in] PTX version of dispatch kernels
+        DeviceScanInitKernelPtr     device_scan_init_kernel,        ///< [in] Kernel function pointer to the tile-status init kernel
+        DeviceRleSweepKernelPtr     device_rle_sweep_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
+        KernelConfig                device_rle_config)              ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal (needed for the grid-dimension attribute query further down)
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Number of input tiles.  A tile is block_threads * items_per_thread
+            // items; the ceiling division gives a partial trailing tile a tile of
+            // its own.  The sweep grid further down is sized from this tile count
+            // and the device's maximum grid x-dimension only, so no other device
+            // attribute needs to be queried up front.
+            int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log device_scan_init_kernel configuration (at least one block is launched, even for zero tiles)
+            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+            if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
+            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                tile_status,
+                num_tiles,
+                d_num_runs_out);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Return if empty problem (the init kernel has already been launched above)
+            if (num_items == 0)
+                break;
+
+            // Get SM occupancy for device_rle_sweep_kernel (only reported in the debug log below)
+            int device_rle_kernel_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                device_rle_kernel_sm_occupancy,            // out
+                device_rle_sweep_kernel,
+                device_rle_config.block_threads))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;
+
+            // Get grid size for scanning tiles: tiles beyond max_dim_x spill into the grid's y-dimension
+            dim3 scan_grid_size;
+            scan_grid_size.z = 1;
+            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+
+            // Log device_rle_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);
+
+            // Invoke device_rle_sweep_kernel
+            device_rle_sweep_kernel<<<scan_grid_size, device_rle_config.block_threads, 0, stream>>>(
+                d_in,
+                d_offsets_out,
+                d_lengths_out,
+                d_num_runs_out,
+                tile_status,
+                equality_op,
+                num_items,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Dispatch entrypoint: resolves the PTX version, builds the matching kernel configuration, and forwards to the kernel-pointer overload of Dispatch
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to input sequence of data items
+        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets
+        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths
+        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError status = cudaSuccess;
+
+        // Resolve the PTX version: looked up through PtxVersion() when CUB_PTX_ARCH is 0, otherwise taken from CUB_PTX_ARCH
+        int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+        status = PtxVersion(ptx_version);
+        if (CubDebug(status)) return status;
+    #else
+        ptx_version = CUB_PTX_ARCH;
+    #endif
+
+        // Build the kernel dispatch configuration for that PTX version
+        KernelConfig device_rle_config;
+        InitConfigs(ptx_version, device_rle_config);
+
+        // Forward to the kernel-pointer overload, supplying the init and sweep kernel instantiations
+        status = Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_offsets_out,
+            d_lengths_out,
+            d_num_runs_out,
+            equality_op,
+            num_items,
+            stream,
+            debug_synchronous,
+            ptx_version,
+            DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
+            DeviceRleSweepKernel<PtxRleSweepPolicy, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, OffsetT>,
+            device_rle_config);
+
+        // Route the result through CubDebug before handing it back
+        CubDebug(status);
+        return status;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/dispatch/dispatch_scan.cuh b/external/cub/cub/device/dispatch/dispatch_scan.cuh
new file mode 100644
index 00000000000..8944dcd33e0
--- /dev/null
+++ b/external/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -0,0 +1,563 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/agent_scan.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_arch.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Tile status initialization kernel (multi-block): every thread calls InitializeStatus() on the tile status interface
+ */
+template <
+    typename            ScanTileStateT>     ///< Tile status interface type
+__global__ void DeviceScanInitKernel(
+    ScanTileStateT      tile_state,         ///< [in] Tile status interface
+    int                 num_tiles)          ///< [in] Number of tiles
+{
+    // Initialize tile status (no thread is excluded; the tile count is passed through unchanged)
+    tile_state.InitializeStatus(num_tiles);
+}
+
+/**
+ * Initialization kernel for tile status and selected-item count (multi-block)
+ */
+template <
+    typename                ScanTileStateT,         ///< Tile status interface type
+    typename                NumSelectedIteratorT>   ///< Output iterator type for recording the number of items selected
+__global__ void DeviceCompactInitKernel(
+    ScanTileStateT          tile_state,             ///< [in] Tile status interface
+    int                     num_tiles,              ///< [in] Number of tiles
+    NumSelectedIteratorT    d_num_selected_out)     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+{
+    // Every thread takes part in initializing the tile status
+    tile_state.InitializeStatus(num_tiles);
+
+    // Exactly one thread (thread 0 of block 0) zeroes d_num_selected_out
+    const bool is_lead_thread = (blockIdx.x == 0) && (threadIdx.x == 0);
+    if (is_lead_thread) *d_num_selected_out = 0;
+}
+
+
+/**
+ * Scan kernel entry point (multi-block): each thread block builds an AgentScan and calls its ConsumeRange()
+ */
+template <
+    typename            ScanPolicyT,        ///< Parameterized ScanPolicyT tuning policy type (its BLOCK_THREADS is used as the launch bound)
+    typename            InputIteratorT,     ///< Random-access input iterator type for reading scan inputs \iterator
+    typename            OutputIteratorT,    ///< Random-access output iterator type for writing scan outputs \iterator
+    typename            ScanTileStateT,     ///< Tile status interface type
+    typename            ScanOpT,            ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename            InitValueT,         ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans)
+    typename            OffsetT>            ///< Signed integer type for global offsets
+__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS))
+__global__ void DeviceScanKernel(
+    InputIteratorT      d_in,               ///< [in] Input data
+    OutputIteratorT     d_out,              ///< [out] Output data
+    ScanTileStateT      tile_state,         ///< [in] Tile status interface
+    int                 start_tile,         ///< [in] The starting tile for the current grid
+    ScanOpT             scan_op,            ///< [in] Binary scan functor
+    InitValueT          init_value,         ///< [in] Initial value to seed the exclusive scan
+    OffsetT             num_items)          ///< [in] Total number of scan items for the entire problem
+{
+    // Thread block type for scanning input tiles
+    typedef AgentScan<
+        ScanPolicyT,
+        InputIteratorT,
+        OutputIteratorT,
+        ScanOpT,
+        InitValueT,
+        OffsetT> AgentScanT;
+
+    // Shared memory for AgentScan
+    __shared__ typename AgentScanT::TempStorage temp_storage;
+
+    // Build a temporary agent bound to the shared storage and let it consume the tiles
+    AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange(
+        num_items,
+        tile_state,
+        start_tile);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceScan
+ */
+template <
+    typename InputIteratorT,     ///< Random-access input iterator type for reading scan inputs \iterator
+    typename OutputIteratorT,    ///< Random-access output iterator type for writing scan outputs \iterator
+    typename ScanOpT,            ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename InitValueT,          ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans)
+    typename OffsetT>            ///< Signed integer type for global offsets
+struct DispatchScan
+{
+    //---------------------------------------------------------------------
+    // Constants and Types
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128
+    };
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<OutputT> ScanTileStateT;
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies
+    //---------------------------------------------------------------------
+
+    /// SM600: scan tuning policy selected for PTX versions >= 600
+    struct Policy600
+    {
+        typedef AgentScanPolicy<
+            CUB_NOMINAL_CONFIG(128, 15, OutputT),      ///< Threads per block, items per thread
+                BLOCK_LOAD_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+
+    /// SM520: scan tuning policy selected for PTX versions in [520, 600)
+    struct Policy520
+    {
+        // Measured throughput on Titan X: 32.47B items/s @ 48M 32-bit T
+        typedef AgentScanPolicy<
+                CUB_NOMINAL_CONFIG(128, 12, OutputT),      ///< Threads per block, items per thread
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+
+    /// SM35: scan tuning policy selected for PTX versions in [350, 520)
+    struct Policy350
+    {
+        // Measured throughput on GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
+        typedef AgentScanPolicy<
+                CUB_NOMINAL_CONFIG(128, 12, OutputT),      ///< Threads per block, items per thread
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
+                BLOCK_SCAN_RAKING>
+            ScanPolicyT;
+    };
+
+    /// SM30: scan tuning policy selected for PTX versions in [300, 350)
+    struct Policy300
+    {
+        typedef AgentScanPolicy<
+                CUB_NOMINAL_CONFIG(256, 9, OutputT),      ///< Threads per block, items per thread
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// SM20: scan tuning policy selected for PTX versions in [200, 300)
+    struct Policy200
+    {
+        // Measured throughput on GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
+        typedef AgentScanPolicy<
+                CUB_NOMINAL_CONFIG(128, 12, OutputT),      ///< Threads per block, items per thread
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+    /// SM13: scan tuning policy selected for PTX versions in [130, 200)
+    struct Policy130
+    {
+        typedef AgentScanPolicy<
+                CUB_NOMINAL_CONFIG(96, 21, OutputT),      ///< Threads per block, items per thread
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            ScanPolicyT;
+    };
+
+    /// SM10: fallback scan tuning policy, selected for PTX versions below 130
+    struct Policy100
+    {
+        typedef AgentScanPolicy<
+                CUB_NOMINAL_CONFIG(64, 9, OutputT),      ///< Threads per block, items per thread
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_STORE_WARP_TRANSPOSE,
+                BLOCK_SCAN_WARP_SCANS>
+            ScanPolicyT;
+    };
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies of current PTX compiler pass
+    //---------------------------------------------------------------------
+
+#if (CUB_PTX_ARCH >= 600)
+    typedef Policy600 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 520)
+    typedef Policy520 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;    // Fallback whenever CUB_PTX_ARCH is below 130
+
+#endif
+
+    // "Opaque" policy: a distinct named type derived from the selected ScanPolicyT, so the policy's parameterization isn't reflected in the type signature of kernels instantiated with it
+    struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {};
+
+
+    //---------------------------------------------------------------------
+    // Utilities
+    //---------------------------------------------------------------------
+
+    /**
+     * Fill \p scan_kernel_config from a tuning policy: the compile-time PtxAgentScanPolicy when CUB_PTX_ARCH > 0, otherwise the PolicyNNN whose threshold matches the runtime \p ptx_version
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,            ///< [in] PTX version used to pick a policy at run time (ignored when CUB_PTX_ARCH > 0)
+        KernelConfig    &scan_kernel_config)    ///< [out] Configuration object; must provide a member template Init<PolicyT>()
+    {
+    #if (CUB_PTX_ARCH > 0)
+        (void)ptx_version;
+
+        // We're on the device, so initialize the kernel dispatch configuration with the current PTX policy
+        scan_kernel_config.template Init<PtxAgentScanPolicy>();
+
+    #else
+
+        // We're on the host, so look up and initialize the kernel dispatch configuration with the policy that matches the device's PTX version (thresholds mirror the PtxPolicy #if chain)
+        if (ptx_version >= 600)
+        {
+            scan_kernel_config.template Init<typename Policy600::ScanPolicyT>();
+        }
+        else if (ptx_version >= 520)
+        {
+            scan_kernel_config.template Init<typename Policy520::ScanPolicyT>();
+        }
+        else if (ptx_version >= 350)
+        {
+            scan_kernel_config.template Init<typename Policy350::ScanPolicyT>();
+        }
+        else if (ptx_version >= 300)
+        {
+            scan_kernel_config.template Init<typename Policy300::ScanPolicyT>();
+        }
+        else if (ptx_version >= 200)
+        {
+            scan_kernel_config.template Init<typename Policy200::ScanPolicyT>();
+        }
+        else if (ptx_version >= 130)
+        {
+            scan_kernel_config.template Init<typename Policy130::ScanPolicyT>();
+        }
+        else
+        {
+            scan_kernel_config.template Init<typename Policy100::ScanPolicyT>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Launch parameters for a kernel, captured from a tuning policy's compile-time constants.
+     */
+    struct KernelConfig
+    {
+        int block_threads;      // PolicyT::BLOCK_THREADS
+        int items_per_thread;   // PolicyT::ITEMS_PER_THREAD
+        int tile_items;         // block_threads * items_per_thread
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            tile_items          = PolicyT::BLOCK_THREADS * PolicyT::ITEMS_PER_THREAD;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            block_threads       = PolicyT::BLOCK_THREADS;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine: sizes/aliases the temporary storage, launches \p init_kernel to set up the
+     * tile descriptors, then launches \p scan_kernel over the tiles.  Returns the first CUDA error encountered.
+     */
+    template <
+        typename            ScanInitKernelPtrT,     ///< Function type of cub::DeviceScanInitKernel
+        typename            ScanSweepKernelPtrT>    ///< Function type of cub::DeviceScanKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*               d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                   ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                  ///< [out] Pointer to the output sequence of data items
+        ScanOpT             scan_op,                ///< [in] Binary scan functor
+        InitValueT          init_value,             ///< [in] Initial value to seed the exclusive scan
+        OffsetT             num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t        stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous,      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int                 /*ptx_version*/,        ///< [in] PTX version of dispatch kernels (unnamed: not used by this routine)
+        ScanInitKernelPtrT  init_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
+        ScanSweepKernelPtrT scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel
+        KernelConfig        scan_kernel_config)     ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+        (void)d_temp_storage;
+        (void)temp_storage_bytes;
+        (void)d_in;
+        (void)d_out;
+        (void)scan_op;
+        (void)init_value;
+        (void)num_items;
+        (void)stream;
+        (void)debug_synchronous;
+        (void)init_kernel;
+        (void)scan_kernel;
+        (void)scan_kernel_config;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+        cudaError error = cudaSuccess;
+        do  // Single-pass loop: any failing step records its status in 'error' and breaks out to the return below
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;   // NOTE(review): queried here but not referenced again in this routine
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles (ceiling division of num_items by the tile size)
+            int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;   // NOTE(review): stored as int; narrowed if OffsetT is wider than int
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Return if empty problem (checked only after the size query above has been answered)
+            if (num_items == 0)
+                break;
+
+            // Construct the tile status interface
+            ScanTileStateT tile_state;
+            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log init_kernel configuration (grid size is the ceiling of num_tiles / INIT_KERNEL_THREADS)
+            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
+            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke init_kernel to initialize tile descriptors
+            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                tile_state,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Get SM occupancy for scan_kernel (only reported in the debug log below)
+            int scan_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                scan_sm_occupancy,            // out
+                scan_kernel,
+                scan_kernel_config.block_threads))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;   // NOTE(review): stray second ';' (empty statement)
+
+            // Run grids in epochs (in case number of tiles exceeds max x-dimension).  NOTE(review): every epoch launches scan_grid_size blocks, so the last one may cover tile indices >= num_tiles; presumably the kernel skips those -- confirm in AgentScan::ConsumeRange
+            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
+            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
+            {
+                // Log scan_kernel configuration (the leading %d is this epoch's start_tile)
+                if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                    start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy);
+
+                // Invoke scan_kernel
+                scan_kernel<<<scan_grid_size, scan_kernel_config.block_threads, 0, stream>>>(
+                    d_in,
+                    d_out,
+                    tile_state,
+                    start_tile,
+                    scan_op,
+                    init_value,
+                    num_items);
+
+                // Check for failure to launch (this break leaves only the epoch loop; nothing follows it, so 'error' is returned as-is)
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine: queries the PTX version, builds the matching KernelConfig, and forwards to the kernel-pointer Dispatch overload with DeviceScanInitKernel / DeviceScanKernel
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*           d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&         temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                  ///< [out] Pointer to the output sequence of data items
+        ScanOpT         scan_op,                ///< [in] Binary scan functor
+        InitValueT      init_value,             ///< [in] Initial value to seed the exclusive scan
+        OffsetT         num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t    stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool            debug_synchronous)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do  // Single-pass loop so a failing step can break straight to the return
+        {
+            // Get PTX version
+            int ptx_version;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Get kernel dispatch configuration for that PTX version
+            KernelConfig scan_kernel_config;
+            InitConfigs(ptx_version, scan_kernel_config);
+
+            // Dispatch, passing the kernels instantiated for this scan (the sweep kernel uses the opaque PtxAgentScanPolicy)
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_in,
+                d_out,
+                scan_op,
+                init_value,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                DeviceScanInitKernel<ScanTileStateT>,
+                DeviceScanKernel<PtxAgentScanPolicy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>,
+                scan_kernel_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/dispatch/dispatch_select_if.cuh b/external/cub/cub/device/dispatch/dispatch_select_if.cuh
new file mode 100644
index 00000000000..6f033197c2d
--- /dev/null
+++ b/external/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -0,0 +1,542 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch_scan.cuh"
+#include "../../agent/agent_select_if.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Multi-block kernel entry point for device-wide selection.
+ *
+ * Selection mode: functor-based when SelectOpT is not NullType; otherwise
+ * flag-based when FlagsInputIteratorT's value type is not NullType; otherwise
+ * discontinuity-based (keep unique items).
+ */
+template <
+    typename            AgentSelectIfPolicyT,       ///< Parameterized AgentSelectIfPolicyT tuning policy type
+    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items
+    typename            FlagsInputIteratorT,        ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename            SelectedOutputIteratorT,    ///< Random-access output iterator type for writing selected items
+    typename            NumSelectedIteratorT,       ///< Output iterator type for recording the number of items selected
+    typename            ScanTileStateT,             ///< Tile status interface type
+    typename            SelectOpT,                  ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
+    typename            EqualityOpT,                ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
+    typename            OffsetT,                    ///< Signed integer type for global offsets
+    bool                KEEP_REJECTS>               ///< Whether or not we push rejected items to the back of the output
+__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS))
+__global__ void DeviceSelectSweepKernel(
+    InputIteratorT          d_in,                   ///< [in] Pointer to the input sequence of data items
+    FlagsInputIteratorT     d_flags,                ///< [in] Pointer to the input sequence of selection flags (if applicable)
+    SelectedOutputIteratorT d_selected_out,         ///< [out] Pointer to the output sequence of selected data items
+    NumSelectedIteratorT    d_num_selected_out,     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+    ScanTileStateT          tile_status,            ///< [in] Tile status interface
+    SelectOpT               select_op,              ///< [in] Selection operator
+    EqualityOpT             equality_op,            ///< [in] Equality operator
+    OffsetT                 num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
+    int                     num_tiles)              ///< [in] Total number of tiles for the entire problem
+{
+    // Agent specialization that performs the selection for this thread block
+    typedef AgentSelectIf<
+        AgentSelectIfPolicyT,
+        InputIteratorT,
+        FlagsInputIteratorT,
+        SelectedOutputIteratorT,
+        SelectOpT,
+        EqualityOpT,
+        OffsetT,
+        KEEP_REJECTS> AgentT;
+
+    // Shared-memory scratch space required by the agent
+    __shared__ typename AgentT::TempStorage agent_storage;
+
+    // Build the agent over the kernel arguments, then let it consume the tiles
+    AgentT agent(agent_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items);
+    agent.ConsumeRange(num_tiles,
+                       tile_status,
+                       d_num_selected_out);
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
+ */
+template <
+    typename    InputIteratorT,                 ///< Random-access input iterator type for reading input items
+    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename    SelectedOutputIteratorT,        ///< Random-access output iterator type for writing selected items
+    typename    NumSelectedIteratorT,           ///< Output iterator type for recording the number of items selected
+    typename    SelectOpT,                      ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
+    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
+struct DispatchSelectIf
+{
+    /******************************************************************************
+     * Types and constants
+     ******************************************************************************/
+
+    // The output value type (its size scales ITEMS_PER_THREAD in the tuning policies)
+    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
+        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // The flag value type (value type of the selection-flags iterator)
+    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128,  // NOTE(review): presumably the block size for the tile-status init kernel launch -- confirm in Dispatch
+    };
+
+    // Tile status descriptor interface type, parameterized on the offset type (not on OutputT)
+    typedef ScanTileState<OffsetT> ScanTileStateT;
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35: select-if tuning policy selected for PTX versions >= 350
+    struct Policy350
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 10,   // Items per thread tuned for 4-byte items
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),    // Nominal count scaled by 4 / sizeof(OutputT), kept within [1, nominal] via CUB_MAX / CUB_MIN
+        };
+
+        typedef AgentSelectIfPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectIfPolicyT;
+    };
+
+    /// SM30: select-if tuning policy selected for PTX versions in [300, 350)
+    struct Policy300
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 7,    // Items per thread tuned for 4-byte items
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),    // Nominal count scaled by 4 / sizeof(OutputT), kept within [3, nominal]; note the lower bound is 3 here, not 1 as in the other policies
+        };
+
+        typedef AgentSelectIfPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectIfPolicyT;
+    };
+
+    /// SM20: select-if tuning policy selected for PTX versions in [200, 300)
+    struct Policy200
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15,  // Items per thread tuned for 4-byte items; smaller when rejected items are kept
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),    // Nominal count scaled by 4 / sizeof(OutputT), kept within [1, nominal] via CUB_MAX / CUB_MIN
+        };
+
+        typedef AgentSelectIfPolicy<
+                128,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SelectIfPolicyT;
+    };
+
+    /// SM13: select-if tuning policy selected for PTX versions in [130, 200)
+    struct Policy130
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,    // Items per thread tuned for 4-byte items
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),    // Nominal count scaled by 4 / sizeof(OutputT), kept within [1, nominal] via CUB_MAX / CUB_MIN
+        };
+
+        typedef AgentSelectIfPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SelectIfPolicyT;
+    };
+
+    /// SM10: fallback select-if tuning policy, selected for PTX versions below 130
+    struct Policy100
+    {
+        enum {
+            NOMINAL_4B_ITEMS_PER_THREAD = 9,    // Items per thread tuned for 4-byte items
+            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),    // Nominal count scaled by 4 / sizeof(OutputT), kept within [1, nominal] via CUB_MAX / CUB_MIN
+        };
+
+        typedef AgentSelectIfPolicy<
+                64,
+                ITEMS_PER_THREAD,
+                BLOCK_LOAD_WARP_TRANSPOSE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_RAKING>
+            SelectIfPolicyT;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+
+#else
+    typedef Policy100 PtxPolicy;    // Fallback whenever CUB_PTX_ARCH is below 130
+
+#endif
+
+    // "Opaque" policy: a distinct named type derived from the selected SelectIfPolicyT, so the policy's parameterization isn't reflected in the type signature of kernels instantiated with it
+    struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Fill \p select_if_config from a tuning policy: the compile-time PtxSelectIfPolicyT when CUB_PTX_ARCH > 0, otherwise the PolicyNNN whose threshold matches the runtime \p ptx_version
+     */
+    template <typename KernelConfig>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,        ///< [in] PTX version used to pick a policy at run time (ignored when CUB_PTX_ARCH > 0)
+        KernelConfig    &select_if_config)  ///< [out] Configuration object; must provide a member template Init<PolicyT>()
+    {
+    #if (CUB_PTX_ARCH > 0)
+        (void)ptx_version;
+
+        // We're on the device, so initialize the kernel dispatch configuration with the current PTX policy
+        select_if_config.template Init<PtxSelectIfPolicyT>();
+
+    #else
+
+        // We're on the host, so look up and initialize the kernel dispatch configuration with the policy that matches the device's PTX version (thresholds mirror the PtxPolicy #if chain)
+        if (ptx_version >= 350)
+        {
+            select_if_config.template Init<typename Policy350::SelectIfPolicyT>();
+        }
+        else if (ptx_version >= 300)
+        {
+            select_if_config.template Init<typename Policy300::SelectIfPolicyT>();
+        }
+        else if (ptx_version >= 200)
+        {
+            select_if_config.template Init<typename Policy200::SelectIfPolicyT>();
+        }
+        else if (ptx_version >= 130)
+        {
+            select_if_config.template Init<typename Policy130::SelectIfPolicyT>();
+        }
+        else
+        {
+            select_if_config.template Init<typename Policy100::SelectIfPolicyT>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Launch parameters for a kernel, captured from a tuning policy's compile-time constants.
+     */
+    struct KernelConfig
+    {
+        int block_threads;      // PolicyT::BLOCK_THREADS
+        int items_per_thread;   // PolicyT::ITEMS_PER_THREAD
+        int tile_items;         // block_threads * items_per_thread
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            tile_items          = PolicyT::BLOCK_THREADS * PolicyT::ITEMS_PER_THREAD;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            block_threads       = PolicyT::BLOCK_THREADS;
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide selection using the
+     * specified kernel functions.
+     */
+    template <
+        typename                    ScanInitKernelPtrT,             ///< Function type of cub::DeviceCompactInitKernel
+        typename                    SelectIfKernelPtrT>             ///< Function type of cub::DeviceSelectSweepKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+        SelectOpT                   select_op,                      ///< [in] Selection operator
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int                         /*ptx_version*/,                ///< [in] PTX version of dispatch kernels (unnamed: not used by this routine)
+        ScanInitKernelPtrT          scan_init_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceCompactInitKernel
+        SelectIfKernelPtrT          select_if_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
+        KernelConfig                select_if_config)               ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
+    {
+        // Without CUB_RUNTIME_ENABLED every argument is marked as unused and cudaErrorNotSupported is returned
+#ifndef CUB_RUNTIME_ENABLED
+        (void)d_temp_storage;
+        (void)temp_storage_bytes;
+        (void)d_in;
+        (void)d_flags;
+        (void)d_selected_out;
+        (void)d_num_selected_out;
+        (void)select_op;
+        (void)equality_op;
+        (void)num_items;
+        (void)stream;
+        (void)debug_synchronous;
+        (void)scan_init_kernel;
+        (void)select_if_kernel;
+        (void)select_if_config;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count (sm_count is not referenced again in this routine)
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles (ceiling division of num_items by the tile size)
+            int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t  allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Size the init grid (at least one block) and log scan_init_kernel configuration
+            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+            if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke scan_init_kernel to initialize tile descriptors
+            scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                tile_status,
+                num_tiles,
+                d_num_selected_out);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Return if empty problem (the init kernel above has already been launched at this point)
+            if (num_items == 0)
+                break;
+
+            // Get SM occupancy for select_if_kernel (the value is only reported in the debug log below)
+            int range_select_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                range_select_sm_occupancy,            // out
+                select_if_kernel,
+                select_if_config.block_threads))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Get grid size for scanning tiles: x is capped at max_dim_x and the remainder spills into the y-dimension
+            dim3 scan_grid_size;
+            scan_grid_size.z = 1;
+            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+
+            // Log select_if_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);
+
+            // Invoke select_if_kernel
+            select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
+                d_in,
+                d_flags,
+                d_selected_out,
+                d_num_selected_out,
+                tile_status,
+                select_op,
+                equality_op,
+                num_items,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine: resolves the PTX version, initializes the kernel configuration, and forwards to the kernel-pointer Dispatch overload
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
+        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+        SelectOpT                   select_op,                      ///< [in] Selection operator
+        EqualityOpT                 equality_op,                    ///< [in] Equality operator
+        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version (queried via PtxVersion() when CUB_PTX_ARCH is 0, otherwise taken from CUB_PTX_ARCH)
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configuration
+            KernelConfig select_if_config;
+            InitConfigs(ptx_version, select_if_config);
+
+            // Dispatch with the init and sweep kernels instantiated for this type's template parameters
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_in,
+                d_flags,
+                d_selected_out,
+                d_num_selected_out,
+                select_op,
+                equality_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
+                DeviceSelectSweepKernel<PtxSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS>,
+                select_if_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/external/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
new file mode 100644
index 00000000000..3417913c7d8
--- /dev/null
+++ b/external/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -0,0 +1,834 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/single_pass_scan_operators.cuh"
+#include "../../agent/agent_segment_fixup.cuh"
+#include "../../agent/agent_spmv_orig.cuh"
+#include "../../util_type.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../thread/thread_search.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * SpMV kernel entry points
+ *****************************************************************************/
+
+/**
+ * Spmv single-column kernel. Each thread with row_idx < num_rows writes one element of d_vector_y (zero for an empty row).
+ */
+template <
+    typename    AgentSpmvPolicyT,           ///< Parameterized SpmvPolicy tuning policy type
+    typename    ValueT,                     ///< Matrix and vector value type
+    typename    OffsetT>                    ///< Signed integer type for sequence offsets
+__global__ void DeviceSpmv1ColKernel(
+    SpmvParams<ValueT, OffsetT> spmv_params)                ///< [in] SpMV input parameter bundle
+{
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
+            ValueT,
+            OffsetT>
+        VectorValueIteratorT;
+
+    VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x);
+
+    int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (row_idx < spmv_params.num_rows)
+    {
+        OffsetT     end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx];
+        OffsetT     nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1];    // NOTE(review): reads index -1 for row 0; presumably the caller offsets d_row_end_offsets by one element -- confirm
+        // Only the nonzero at the row's start offset is used; presumably a row holds at most one nonzero in the single-column case -- confirm
+        ValueT value = 0.0;
+        if (end_nonzero_idx != nonzero_idx)
+        {
+            value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]];
+        }
+
+        spmv_params.d_vector_y[row_idx] = value;
+    }
+}
+
+
+/**
+ * Spmv search kernel. Identifies merge path starting coordinates for each tile; each thread handles one tile index.
+ */
+template <
+    typename    SpmvPolicyT,                    ///< Parameterized SpmvPolicy tuning policy type
+    typename    OffsetT,                        ///< Signed integer type for sequence offsets
+    typename    CoordinateT,                    ///< Merge path coordinate type
+    typename    SpmvParamsT>                    ///< SpmvParams type
+__global__ void DeviceSpmvSearchKernel(
+    int             num_merge_tiles,            ///< [in] Number of SpMV merge tiles (spmv grid size)
+    CoordinateT*    d_tile_coordinates,         ///< [out] Pointer to the temporary array of tile starting coordinates
+    SpmvParamsT     spmv_params)                ///< [in] SpMV input parameter bundle
+{
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS           = SpmvPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = SpmvPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    typedef CacheModifiedInputIterator<
+            SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        RowOffsetsSearchIteratorT;
+
+    // Find the starting coordinate for all tiles (plus the end coordinate of the last one), hence num_merge_tiles + 1 entries
+    int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (tile_idx < num_merge_tiles + 1)
+    {
+        OffsetT                         diagonal = (tile_idx * TILE_ITEMS);
+        CoordinateT                     tile_coordinate;
+        CountingInputIterator<OffsetT>  nonzero_indices(0);
+
+        // Search the merge path along this tile's diagonal; the result is received in tile_coordinate
+        MergePathSearch(
+            diagonal,
+            RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
+            nonzero_indices,
+            spmv_params.num_rows,
+            spmv_params.num_nonzeros,
+            tile_coordinate);
+
+        // Output starting offset
+        d_tile_coordinates[tile_idx] = tile_coordinate;
+    }
+}
+
+
+/**
+ * Spmv agent entry point: runs AgentSpmv::ConsumeTile, then initializes the fixup tile status
+ */
+template <
+    typename        SpmvPolicyT,                ///< Parameterized SpmvPolicy tuning policy type
+    typename        ScanTileStateT,             ///< Tile status interface type
+    typename        ValueT,                     ///< Matrix and vector value type
+    typename        OffsetT,                    ///< Signed integer type for sequence offsets
+    typename        CoordinateT,                ///< Merge path coordinate type
+    bool            HAS_ALPHA,                  ///< Whether the input parameter Alpha is 1
+    bool            HAS_BETA>                   ///< Whether the input parameter Beta is 0
+__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS))
+__global__ void DeviceSpmvKernel(
+    SpmvParams<ValueT, OffsetT>     spmv_params,                ///< [in] SpMV input parameter bundle
+    CoordinateT*                    d_tile_coordinates,         ///< [in] Pointer to the temporary array of tile starting coordinates
+    KeyValuePair<OffsetT,ValueT>*   d_tile_carry_pairs,         ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
+    int                             num_tiles,                  ///< [in] Number of merge tiles
+    ScanTileStateT                  tile_state,                 ///< [in] Tile status interface for fixup reduce-by-key kernel
+    int                             num_segment_fixup_tiles)    ///< [in] Number of reduce-by-key tiles (fixup grid size)
+{
+    // Spmv agent type specialization
+    typedef AgentSpmv<
+            SpmvPolicyT,
+            ValueT,
+            OffsetT,
+            HAS_ALPHA,
+            HAS_BETA>
+        AgentSpmvT;
+
+    // Shared memory for AgentSpmv
+    __shared__ typename AgentSpmvT::TempStorage temp_storage;
+    // Construct the agent over the shared storage and invoke ConsumeTile
+    AgentSpmvT(temp_storage, spmv_params).ConsumeTile(
+        d_tile_coordinates,
+        d_tile_carry_pairs,
+        num_tiles);
+
+    // Initialize fixup tile status for num_segment_fixup_tiles tiles
+    tile_state.InitializeStatus(num_segment_fixup_tiles);
+
+}
+
+
+/**
+ * Multi-block reduce-by-key sweep kernel entry point (segment fixup; combines pairs using cub::Equality and cub::Sum)
+ */
+template <
+    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
+    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
+    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    typename    ScanTileStateT>                 ///< Tile status interface type
+__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
+__global__ void DeviceSegmentFixupKernel(
+    PairsInputIteratorT         d_pairs_in,         ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
+    AggregatesOutputIteratorT   d_aggregates_out,   ///< [in,out] Output value aggregates
+    OffsetT                     num_items,          ///< [in] Total number of items to select from
+    int                         num_tiles,          ///< [in] Total number of tiles for the entire problem
+    ScanTileStateT              tile_state)         ///< [in] Tile status interface
+{
+    // Thread block type for reducing tiles of value segments
+    typedef AgentSegmentFixup<
+            AgentSegmentFixupPolicyT,
+            PairsInputIteratorT,
+            AggregatesOutputIteratorT,
+            cub::Equality,
+            cub::Sum,
+            OffsetT>
+        AgentSegmentFixupT;
+
+    // Shared memory for AgentSegmentFixup
+    __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;
+
+    // Process tiles: the agent is constructed over the shared storage with default-constructed equality and sum functors
+    AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange(
+        num_items,
+        num_tiles,
+        tile_state);
+}
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv
+ */
+template <
+    typename    ValueT,                     ///< Matrix and vector value type
+    typename    OffsetT>                    ///< Signed integer type for global offsets
+struct DispatchSpmv
+{
+    //---------------------------------------------------------------------
+    // Constants and Types
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        INIT_KERNEL_THREADS = 128           // Thread block size used (at least) for the single-column kernel launch in Dispatch
+    };
+
+    // SpmvParams bundle type
+    typedef SpmvParams<ValueT, OffsetT> SpmvParamsT;
+
+    // 2D merge path coordinate type
+    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
+
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;
+
+
+    //---------------------------------------------------------------------
+    // Tuning policies
+    //---------------------------------------------------------------------
+
+    /// SM11 (fallback policy, selected when the PTX version is below 200)
+    struct Policy110
+    {
+        typedef AgentSpmvPolicy<
+                128,
+                1,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+    /// SM20 (selected when the PTX version is at least 200 and below 300)
+    struct Policy200 
+    {
+        typedef AgentSpmvPolicy<
+                96,
+                18,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_RAKING>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+
+    };
+
+
+
+    /// SM30 (selected when the PTX version is at least 300 and below 350)
+    struct Policy300 
+    {
+        typedef AgentSpmvPolicy<
+                96,
+                6,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                4,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_DEFAULT,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+
+    };
+
+
+    /// SM35 (selected when the PTX version is at least 350 and below 370); some parameters depend on whether sizeof(ValueT) exceeds 4 bytes
+    struct Policy350
+    {
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 96 : 128,
+                (sizeof(ValueT) > 4) ? 4 : 7,
+                LOAD_LDG,
+                LOAD_CA,
+                LOAD_LDG,
+                LOAD_LDG,
+                LOAD_LDG,
+                (sizeof(ValueT) > 4) ? true : false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+
+    /// SM37 (selected when the PTX version is at least 370 and below 500)
+    struct Policy370
+    {
+
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 128 : 128,       // both branches yield 128
+                (sizeof(ValueT) > 4) ? 9 : 14,
+                LOAD_LDG,
+                LOAD_CA,
+                LOAD_LDG,
+                LOAD_LDG,
+                LOAD_LDG,
+                false, 
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+    /// SM50 (selected when the PTX version is at least 500 and below 600); most parameters depend on whether sizeof(ValueT) exceeds 4 bytes
+    struct Policy500
+    {
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 64 : 128,
+                (sizeof(ValueT) > 4) ? 6 : 7,
+                LOAD_LDG,
+                LOAD_DEFAULT,
+                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
+                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
+                LOAD_LDG,
+                (sizeof(ValueT) > 4) ? true : false,
+                (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>
+            SpmvPolicyT;
+
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_VECTORIZE,
+                LOAD_LDG,
+                BLOCK_SCAN_RAKING_MEMOIZE>
+            SegmentFixupPolicyT;
+    };
+
+
+    /// SM60 (selected when the PTX version is 600 or above)
+    struct Policy600
+    {
+        typedef AgentSpmvPolicy<
+                (sizeof(ValueT) > 4) ? 64 : 128,
+                (sizeof(ValueT) > 4) ? 5 : 7,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                LOAD_DEFAULT,
+                false,
+                BLOCK_SCAN_WARP_SCANS>
+            SpmvPolicyT;
+
+
+        typedef AgentSegmentFixupPolicy<
+                128,
+                3,
+                BLOCK_LOAD_DIRECT,
+                LOAD_LDG,
+                BLOCK_SCAN_WARP_SCANS>
+            SegmentFixupPolicyT;
+    };
+
+
+
+    //---------------------------------------------------------------------
+    // Tuning policy of the current PTX compiler pass (selected at compile time by CUB_PTX_ARCH)
+    //---------------------------------------------------------------------
+
+#if (CUB_PTX_ARCH >= 600)
+    typedef Policy600 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 500)
+    typedef Policy500 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 370)
+    typedef Policy370 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#else
+    typedef Policy110 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature); each derives from the selected PtxPolicy member
+    struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {};
+    struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {};
+
+
+    //---------------------------------------------------------------------
+    // Utilities
+    //---------------------------------------------------------------------
+
+    /**
+     * Initialize the spmv and segment-fixup kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
+     */
+    template <typename KernelConfig>        // NOTE: this template parameter shares its name with the KernelConfig struct declared below
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static void InitConfigs(
+        int             ptx_version,
+        KernelConfig    &spmv_config,
+        KernelConfig    &segment_fixup_config)
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy (ptx_version is not consulted on this path)
+        spmv_config.template Init<PtxSpmvPolicyT>();
+        segment_fixup_config.template Init<PtxSegmentFixupPolicy>();
+
+    #else
+
+        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version (highest threshold first)
+        if (ptx_version >= 600)
+        {
+            spmv_config.template            Init<typename Policy600::SpmvPolicyT>();
+            segment_fixup_config.template   Init<typename Policy600::SegmentFixupPolicyT>();
+        }
+        else if (ptx_version >= 500)
+        {
+            spmv_config.template            Init<typename Policy500::SpmvPolicyT>();
+            segment_fixup_config.template   Init<typename Policy500::SegmentFixupPolicyT>();
+        }
+        else if (ptx_version >= 370)
+        {
+            spmv_config.template            Init<typename Policy370::SpmvPolicyT>();
+            segment_fixup_config.template   Init<typename Policy370::SegmentFixupPolicyT>();
+        }
+        else if (ptx_version >= 350)
+        {
+            spmv_config.template            Init<typename Policy350::SpmvPolicyT>();
+            segment_fixup_config.template   Init<typename Policy350::SegmentFixupPolicyT>();
+        }
+        else if (ptx_version >= 300)
+        {
+            spmv_config.template            Init<typename Policy300::SpmvPolicyT>();
+            segment_fixup_config.template   Init<typename Policy300::SegmentFixupPolicyT>();
+
+        }
+        else if (ptx_version >= 200)
+        {
+            spmv_config.template            Init<typename Policy200::SpmvPolicyT>();
+            segment_fixup_config.template   Init<typename Policy200::SegmentFixupPolicyT>();
+        }
+        else
+        {
+            spmv_config.template            Init<typename Policy110::SpmvPolicyT>();
+            segment_fixup_config.template   Init<typename Policy110::SegmentFixupPolicyT>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration (launch geometry captured from a tuning policy).
+     */
+    struct KernelConfig
+    {
+        int block_threads;          // Copied from PolicyT::BLOCK_THREADS by Init()
+        int items_per_thread;       // Copied from PolicyT::ITEMS_PER_THREAD by Init()
+        int tile_items;             // block_threads * items_per_thread
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = PolicyT::BLOCK_THREADS;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            tile_items          = block_threads * items_per_thread;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction using the
+     * specified kernel functions.
+     *
+     * If the input is larger than a single tile, this method uses two-passes of
+     * kernel invocations.
+     */
+    template <
+        typename                Spmv1ColKernelT,                    ///< Function type of cub::DeviceSpmv1ColKernel
+        typename                SpmvSearchKernelT,                  ///< Function type of cub::AgentSpmvSearchKernel
+        typename                SpmvKernelT,                        ///< Function type of cub::AgentSpmvKernel
+        typename                SegmentFixupKernelT>                 ///< Function type of cub::DeviceSegmentFixupKernelT
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
+        cudaStream_t            stream,                             ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous,                  ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        Spmv1ColKernelT         spmv_1col_kernel,                   ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
+        SpmvSearchKernelT       spmv_search_kernel,                 ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
+        SpmvKernelT             spmv_kernel,                        ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
+        SegmentFixupKernelT     segment_fixup_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
+        KernelConfig            spmv_config,                        ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
+        KernelConfig            segment_fixup_config)               ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for
+    {
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            if (spmv_params.num_cols == 1)
+            {
+                if (d_temp_storage == NULL)
+                {
+                    // Return if the caller is simply requesting the size of the storage allocation
+                    temp_storage_bytes = 1;
+                    break;
+                }
+
+                // Get search/init grid dims
+                int degen_col_kernel_block_size     = INIT_KERNEL_THREADS;
+                int degen_col_kernel_grid_size      = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;
+
+                if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);
+
+                // Invoke spmv_1col_kernel
+                spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
+                    spmv_params);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+                break;
+            }
+
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
+
+            // Total number of spmv work items
+            int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros;
+
+            // Tile sizes of kernels
+            int merge_tile_size              = spmv_config.block_threads * spmv_config.items_per_thread;
+            int segment_fixup_tile_size     = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;
+
+            // Number of tiles for kernels
+            unsigned int num_merge_tiles            = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
+            unsigned int num_segment_fixup_tiles    = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;
+
+            // Get SM occupancy for kernels
+            int spmv_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                spmv_sm_occupancy,
+                spmv_kernel,
+                spmv_config.block_threads))) break;
+
+            int segment_fixup_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                segment_fixup_sm_occupancy,
+                segment_fixup_kernel,
+                segment_fixup_config.block_threads))) break;
+
+            // Get grid dimensions
+            dim3 spmv_grid_size(
+                CUB_MIN(num_merge_tiles, max_dim_x),
+                (num_merge_tiles + max_dim_x - 1) / max_dim_x,
+                1);
+
+            dim3 segment_fixup_grid_size(
+                CUB_MIN(num_segment_fixup_tiles, max_dim_x),
+                (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x,
+                1);
+
+            // Get the temporary storage allocation requirements
+            size_t allocation_sizes[3];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break;    // bytes needed for reduce-by-key tile status descriptors
+            allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT);       // bytes needed for block carry-out pairs
+            allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT);   // bytes needed for tile starting coordinates
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            void* allocations[3];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_state;
+            if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Alias the other allocations
+            KeyValuePairT*  d_tile_carry_pairs      = (KeyValuePairT*) allocations[1];  // Agent carry-out pairs
+            CoordinateT*    d_tile_coordinates      = (CoordinateT*) allocations[2];    // Agent starting coordinates
+
+            // Get search/init grid dims
+            int search_block_size   = INIT_KERNEL_THREADS;
+            int search_grid_size    = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;
+
+#if (CUB_PTX_ARCH == 0)
+            // Init textures
+            if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
+#endif
+
+            if (search_grid_size < sm_count)
+//            if (num_merge_tiles < spmv_sm_occupancy * sm_count)
+            {
+                // Not enough spmv tiles to saturate the device: have spmv blocks search their own starting coords
+                d_tile_coordinates = NULL;
+            }
+            else
+            {
+                // Use separate search kernel if we have enough spmv tiles to saturate the device
+
+                // Log spmv_search_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    search_grid_size, search_block_size, (long long) stream);
+
+                // Invoke spmv_search_kernel
+                spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
+                    num_merge_tiles,
+                    d_tile_coordinates,
+                    spmv_params);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+
+            // Log spmv_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);
+
+            // Invoke spmv_kernel
+            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
+                spmv_params,
+                d_tile_coordinates,
+                d_tile_carry_pairs,
+                num_merge_tiles,
+                tile_state,
+                num_segment_fixup_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Run reduce-by-key fixup if necessary
+            if (num_merge_tiles > 1)
+            {
+                // Log segment_fixup_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                    segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);
+
+                // Invoke segment_fixup_kernel
+                segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
+                    d_tile_carry_pairs,
+                    spmv_params.d_vector_y,
+                    num_merge_tiles,
+                    num_segment_fixup_tiles,
+                    tile_state);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+
+#if (CUB_PTX_ARCH == 0)
+            // Free textures
+            if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
+#endif
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine for device-wide SpMV: picks the kernel configurations
+     * for the target PTX version and forwards to the kernel-pointer Dispatch overload
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
+        cudaStream_t            stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        // PTX version: queried through PtxVersion() when CUB_PTX_ARCH is 0, else CUB_PTX_ARCH itself
+        int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+        cudaError version_error = PtxVersion(ptx_version);
+        if (CubDebug(version_error)) return version_error;
+    #else
+        ptx_version = CUB_PTX_ARCH;
+    #endif
+
+        // Fill in the launch configurations that InitConfigs associates with that PTX version
+        KernelConfig spmv_config;
+        KernelConfig segment_fixup_config;
+        InitConfigs(ptx_version, spmv_config, segment_fixup_config);
+
+        // Forward to the overload that takes explicit kernel pointers and configurations
+        cudaError dispatch_error = Dispatch(
+            d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
+            DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
+            DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
+            DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
+            DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
+            spmv_config, segment_fixup_config);
+
+        // Report (via CubDebug) and propagate whatever the forwarded dispatch returned
+        CubDebug(dispatch_error);
+        return dispatch_error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/grid/grid_barrier.cuh b/external/cub/cub/grid/grid_barrier.cuh
new file mode 100644
index 00000000000..d9f83360b9e
--- /dev/null
+++ b/external/cub/cub/grid/grid_barrier.cuh
@@ -0,0 +1,211 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
+ */
+
+#pragma once
+
+#include "../util_debug.cuh"
+#include "../util_namespace.cuh"
+#include "../thread/thread_load.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid, using one flag per block (indexed by blockIdx.x) in the array d_sync
+ */
+class GridBarrier
+{
+protected :
+
+    typedef unsigned int SyncFlag;
+
+    // Per-block sync flags (Sync() indexes entries 0 .. gridDim.x-1); NULL until storage is attached, e.g. by GridBarrierLifetime::Setup()
+    SyncFlag* d_sync;
+
+public:
+
+    /**
+     * Constructor: starts with no flag storage (d_sync is NULL)
+     */
+    GridBarrier() : d_sync(NULL) {}
+
+
+    /**
+     * Synchronize: every block raises its flag, block 0 waits for all flags and then clears them, which releases the other blocks.  Flags are expected to be zero on entry; only the x grid/block dimensions are consulted.
+     */
+    __device__ __forceinline__ void Sync() const
+    {
+        volatile SyncFlag *d_vol_sync = d_sync;
+
+        // Threadfence and syncthreads to make sure global writes are visible before
+        // thread-0 reports in with its sync counter
+        __threadfence();
+        CTA_SYNC();
+
+        if (blockIdx.x == 0)
+        {
+            // Block 0 reports in for itself (its thread 0 raises flag 0)
+            if (threadIdx.x == 0)
+            {
+                d_vol_sync[blockIdx.x] = 1;
+            }
+
+            CTA_SYNC();
+
+            // Wait for everyone else to report in: each thread polls a blockDim.x-strided subset of the flags until it reads non-zero
+            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
+            {
+                while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
+                {
+                    __threadfence_block();
+                }
+            }
+
+            CTA_SYNC();
+
+            // Let everyone know it's safe to proceed: clear all flags (a waiting block treats its cleared flag as the go-ahead)
+            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
+            {
+                d_vol_sync[peer_block] = 0;
+            }
+        }
+        else
+        {
+            if (threadIdx.x == 0)
+            {
+                // Report in by raising this block's flag
+                d_vol_sync[blockIdx.x] = 1;
+
+                // Wait for acknowledgment: spin until block 0 clears the flag.  NOTE(review): presumably requires block 0 to be resident concurrently - confirm
+                while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
+                {
+                    __threadfence_block();
+                }
+            }
+
+            CTA_SYNC();
+        }
+    }
+};
+
+
+/**
+ * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
+ *
+ * Uses RAII for lifetime, i.e., device resources are reclaimed when
+ * the destructor is called.
+ */
+class GridBarrierLifetime : public GridBarrier
+{
+protected:
+
+    // Size in bytes of the fully initialized allocation behind d_sync (0 when there is none)
+    size_t sync_bytes;
+
+public:
+
+    /**
+     * Constructor: no device storage is allocated until Setup() is called
+     */
+    GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
+
+
+    /**
+     * Frees the device flag storage (if any), clears d_sync and sync_bytes, and returns the cudaFree result (cudaSuccess when nothing was allocated)
+     */
+    cudaError_t HostReset()
+    {
+        cudaError_t retval = cudaSuccess;
+        if (d_sync)
+        {
+            CubDebug(retval = cudaFree(d_sync));
+            d_sync = NULL;
+        }
+        sync_bytes = 0;
+        return retval;
+    }
+
+
+    /**
+     * Destructor: releases the device storage through HostReset()
+     */
+    virtual ~GridBarrierLifetime()
+    {
+        HostReset();
+    }
+
+
+    /**
+     * Sets up the progress counters for a launch of \p sweep_grid_size blocks: (re)allocates and zeroes the flags only when
+     * the current allocation is too small.  Returns the first CUDA error encountered; on failure no stale pointer/size is kept.
+     */
+    cudaError_t Setup(int sweep_grid_size)
+    {
+        cudaError_t retval = cudaSuccess;
+        do {
+            size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
+            if (new_sync_bytes > sync_bytes)
+            {
+                // Release the undersized buffer through HostReset(), which also clears d_sync and
+                // sync_bytes, so a failure below cannot leave a dangling pointer or a stale size
+                if ((retval = HostReset()) != cudaSuccess) break;
+
+                // Allocate into a temporary so d_sync stays NULL when the allocation fails
+                SyncFlag* d_new_sync = NULL;
+                if (CubDebug(retval = cudaMalloc((void**) &d_new_sync, new_sync_bytes))) break;
+                d_sync = d_new_sync;
+                if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
+                sync_bytes = new_sync_bytes;    // commit the capacity only once the flags are zeroed
+            }
+        } while (0);
+
+        return retval;
+    }
+};
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/grid/grid_even_share.cuh b/external/cub/cub/grid/grid_even_share.cuh
new file mode 100644
index 00000000000..3ba29da7ae6
--- /dev/null
+++ b/external/cub/cub/grid/grid_even_share.cuh
@@ -0,0 +1,222 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly the same number of fixed-size work units (grains).
+ */
+
+
+#pragma once
+
+#include "../util_namespace.cuh"
+#include "../util_macro.cuh"
+#include "grid_mapping.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridEvenShare is a descriptor utility for distributing input among
+ * CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly
+ * the same number of input tiles.
+ *
+ * \par Overview
+ * Each thread block is assigned a consecutive sequence of input tiles.  To help
+ * preserve alignment and eliminate the overhead of guarded loads for all but the
+ * last thread block, GridEvenShare assigns one of three different amounts of
+ * work to a given thread block: "big", "normal", or "last".  The "big" workloads
+ * are one scheduling grain larger than "normal".  The "last" work unit for the
+ * last thread block may be partially-full if the input is not an even multiple of
+ * the scheduling grain size.
+ *
+ * \par
+ * Before invoking a child grid, a parent thread will typically construct an
+ * instance of GridEvenShare.  The instance can be passed to child thread blocks
+ * which can initialize their per-thread block offsets using \p BlockInit().
+ */
+template <typename OffsetT>
+struct GridEvenShare
+{
+private:
+
+    OffsetT     total_tiles;            // Number of tiles covering the input: ceil(num_items / tile_items)
+    int         big_shares;             // Number of leading thread blocks that receive one extra tile
+    OffsetT     big_share_items;        // Items in a "big" share (normal_share_items + tile_items)
+    OffsetT     normal_share_items;     // Items in a "normal" share (avg_tiles_per_block * tile_items)
+    OffsetT     normal_base_offset;     // big_shares * tile_items: the extra items handed out to the big shares
+
+public:
+
+    /// Total number of input items
+    OffsetT     num_items;
+
+    /// Grid size in thread blocks
+    int         grid_size;
+
+    /// Offset into input marking the beginning of the owning thread block's segment of input tiles
+    OffsetT     block_offset;
+
+    /// Offset into input marking the end (one-past) of the owning thread block's segment of input tiles
+    OffsetT     block_end;
+
+    /// Stride between input tiles
+    OffsetT     block_stride;
+
+
+    /**
+     * \brief Constructor.  Zero-initializes every field.
+     */
+    __host__ __device__ __forceinline__ GridEvenShare() :
+        total_tiles(0),
+        big_shares(0),
+        big_share_items(0),
+        normal_share_items(0),
+        normal_base_offset(0),
+        num_items(0),
+        grid_size(0),
+        block_offset(0),
+        block_end(0),
+        block_stride(0)
+    {}
+
+
+    /**
+     * \brief Dispatch initializer. To be called prior to kernel launch.  An empty input (\p num_items == 0) yields \p grid_size == 0.
+     */
+    __host__ __device__ __forceinline__ void DispatchInit(
+        OffsetT num_items,          ///< Total number of input items
+        int     max_grid_size,      ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
+        int     tile_items)         ///< Number of data items per input tile
+    {
+        this->block_offset          = num_items;    // Initialize past-the-end
+        this->block_end             = num_items;    // Initialize past-the-end
+        this->num_items             = num_items;
+        this->total_tiles           = (num_items + tile_items - 1) / tile_items;
+        this->grid_size             = CUB_MIN(total_tiles, max_grid_size);
+        OffsetT avg_tiles_per_block = (grid_size != 0) ? (total_tiles / grid_size) : 0;       // grid_size is 0 for empty input: do not divide by zero
+        this->big_shares            = total_tiles - (avg_tiles_per_block * grid_size);        // leftover grains go to big blocks
+        this->normal_share_items    = avg_tiles_per_block * tile_items;
+        this->normal_base_offset    = big_shares * tile_items;
+        this->big_share_items       = normal_share_items + tile_items;
+    }
+
+
+    /**
+     * \brief Initializes ranges for the specified thread block index.  Specialized
+     * for a "raking" access pattern in which each thread block is assigned a
+     * consecutive sequence of input tiles.
+     */
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        int block_id,
+        Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
+    {
+        block_stride = TILE_ITEMS;
+        if (block_id < big_shares)
+        {
+            // This thread block gets a big share of grains (avg_tiles_per_block + 1)
+            block_offset = (block_id * big_share_items);
+            block_end = block_offset + big_share_items;
+        }
+        else if (block_id < total_tiles)
+        {
+            // This thread block gets a normal share of grains (avg_tiles_per_block), clamped to the end of the input
+            block_offset = normal_base_offset + (block_id * normal_share_items);
+            block_end = CUB_MIN(num_items, block_offset + normal_share_items);
+        }
+        // Else default past-the-end
+    }
+
+
+    /**
+     * \brief Block-initialization, specialized for a "strip mining" access
+     * pattern in which the input tiles assigned to each thread block are
+     * separated by a stride equal to the extent of the grid.
+     */
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        int block_id,
+        Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
+    {
+        block_stride = grid_size * TILE_ITEMS;
+        block_offset = (block_id * TILE_ITEMS);
+        block_end = num_items;
+    }
+
+
+    /**
+     * \brief Block-initialization for the calling thread block: forwards
+     * \p blockIdx.x to the BlockInit overload selected by the \p STRATEGY
+     * template argument.
+     */
+    template <
+        int TILE_ITEMS,
+        GridMappingStrategy STRATEGY>
+    __device__ __forceinline__ void BlockInit()
+    {
+        BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
+    }
+
+
+    /**
+     * \brief Block-initialization from an explicit range: the thread block
+     * is assigned the input items in [\p block_offset, \p block_end) and
+     * walks them with a stride of \p TILE_ITEMS.
+     */
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
+    {
+        this->block_offset = block_offset;
+        this->block_end = block_end;
+        this->block_stride = TILE_ITEMS;
+    }
+
+
+};
+
+
+
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/grid/grid_mapping.cuh b/external/cub/cub/grid/grid_mapping.cuh
new file mode 100644
index 00000000000..6cd89209f83
--- /dev/null
+++ b/external/cub/cub/grid/grid_mapping.cuh
@@ -0,0 +1,113 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
+ */
+
+#pragma once
+
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/******************************************************************************
+ * Mapping policies
+ *****************************************************************************/
+
+
+/**
+ * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
+ */
+enum GridMappingStrategy
+{
+    /**
+     * \brief A "raking" access pattern in which each thread block is
+     * assigned a consecutive sequence of input tiles
+     *
+     * \par Overview
+     * The input is evenly partitioned into \p p segments, where \p p is
+     * constant and corresponds loosely to the number of thread blocks that may
+     * actively reside on the target device. Each segment is comprised of
+     * consecutive tiles, where a tile is a small, constant-sized unit of input
+     * to be processed to completion before the thread block terminates or
+     * obtains more work.  The kernel invokes \p p thread blocks, each
+     * of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
+     * in tile-size increments.
+     */
+    GRID_MAPPING_RAKE,
+
+    /**
+     * \brief A "strip mining" access pattern in which the input tiles assigned
+     * to each thread block are separated by a stride equal to the extent of
+     * the grid.
+     *
+     * \par Overview
+     * The input is evenly partitioned into \p p sets, where \p p is
+     * constant and corresponds loosely to the number of thread blocks that may
+     * actively reside on the target device. Each set is comprised of
+     * data tiles separated by stride \p tiles, where a tile is a small,
+     * constant-sized unit of input to be processed to completion before the
+     * thread block terminates or obtains more work.  The kernel invokes \p p
+     * thread blocks, each of which iteratively consumes a segment of
+     * <em>n</em>/<em>p</em> elements in tile-size increments.
+     */
+    GRID_MAPPING_STRIP_MINE,
+
+    /**
+     * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
+     *
+     * \par Overview
+     * The input is treated as a queue to be dynamically consumed by a grid of
+     * thread blocks.  Work is atomically dequeued in tiles, where a tile is a
+     * unit of input to be processed to completion before the thread block
+     * terminates or obtains more work.  The grid size \p p is constant,
+     * loosely corresponding to the number of thread blocks that may actively
+     * reside on the target device.
+     */
+    GRID_MAPPING_DYNAMIC,
+};
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/grid/grid_queue.cuh b/external/cub/cub/grid/grid_queue.cuh
new file mode 100644
index 00000000000..f413c6d2c4a
--- /dev/null
+++ b/external/cub/cub/grid/grid_queue.cuh
@@ -0,0 +1,220 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridQueue is a descriptor utility for dynamic queue management.
+ */
+
+#pragma once
+
+#include "../util_namespace.cuh"
+#include "../util_debug.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridQueue is a descriptor utility for dynamic queue management.
+ *
+ * \par Overview
+ * GridQueue descriptors provide abstractions for "filling" or
+ * "draining" globally-shared vectors.
+ *
+ * \par
+ * A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
+ * returning a unique offset for the calling thread to write its items.
+ * The GridQueue maintains the total "fill-size".  The fill counter must be reset
+ * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
+ * will be filling.
+ *
+ * \par
+ * Similarly, a "draining" GridQueue works by atomically-incrementing a
+ * zero-initialized counter, returning a unique offset for the calling thread to
+ * read its items. Threads can safely drain until the array's logical fill-size is
+ * exceeded.  The drain counter must be reset using GridQueue::ResetDrain or
+ * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
+ * will be draining.  (For dynamic work distribution of existing data, the corresponding fill-size
+ * is simply the number of elements in the array.)
+ *
+ * \par
+ * Iterative work management can be implemented simply with a pair of flip-flopping
+ * work buffers, each with an associated set of fill and drain GridQueue descriptors.
+ *
+ * \tparam OffsetT Signed integer type for global offsets
+ */
+template <typename OffsetT>
+class GridQueue
+{
+private:
+
+    /// Indices of the two counters within \p d_counters
+    enum
+    {
+        FILL    = 0,    ///< Slot holding the fill counter (the fill-size)
+        DRAIN   = 1,    ///< Slot holding the drain counter
+    };
+
+    /// Pointer to the pair of counters (d_counters[FILL] and d_counters[DRAIN]) inside the storage passed to the constructor
+    OffsetT *d_counters;
+
+public:
+
+    /// Returns the device allocation size in bytes needed to construct a GridQueue instance (room for the two OffsetT counters)
+    __host__ __device__ __forceinline__
+    static size_t AllocationSize()
+    {
+        return sizeof(OffsetT) * 2;
+    }
+
+
+    /// Constructs an invalid GridQueue descriptor (its counter pointer is NULL)
+    __host__ __device__ __forceinline__ GridQueue()
+    :
+        d_counters(NULL)
+    {}
+
+
+    /// Constructs a GridQueue descriptor around the device storage allocation; the storage is reinterpreted as an array of OffsetT counters and is not initialized here
+    __host__ __device__ __forceinline__ GridQueue(
+        void *d_storage)                    ///< Device allocation to back the GridQueue.  Must be at least as big as <tt>AllocationSize()</tt>.
+    :
+        d_counters((OffsetT*) d_storage)
+    {}
+
+
+    /// Sets the fill-size to \p fill_size and zeroes the drain counter, preparing the GridQueue for draining in the next kernel instance.  The CUB_PTX_ARCH > 0 path stores both counters directly and ignores \p stream; the other path uploads both counters with one cudaMemcpyAsync on \p stream.  To be called by the host or by a kernel prior to that which will be draining.
+    __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain(
+        OffsetT fill_size,
+        cudaStream_t stream = 0)
+    {
+#if (CUB_PTX_ARCH > 0)
+        (void)stream;   // unused on this path; the cast silences unused-parameter warnings
+        d_counters[FILL] = fill_size;
+        d_counters[DRAIN] = 0;
+        return cudaSuccess;
+#else
+        OffsetT counters[2];    // staging copy laid out exactly like d_counters
+        counters[FILL] = fill_size;
+        counters[DRAIN] = 0;
+        return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
+#endif
+    }
+
+
+    /// Zeroes the drain counter so that it may advance to meet the existing fill-size (direct store on the CUB_PTX_ARCH > 0 path, cudaMemsetAsync on \p stream otherwise).  To be called by the host or by a kernel prior to that which will be draining.
+    __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0)
+    {
+#if (CUB_PTX_ARCH > 0)
+        (void)stream;   // unused on this path
+        d_counters[DRAIN] = 0;
+        return cudaSuccess;
+#else
+        return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
+#endif
+    }
+
+
+    /// Zeroes the fill counter (direct store on the CUB_PTX_ARCH > 0 path, cudaMemsetAsync on \p stream otherwise).  To be called by the host or by a kernel prior to that which will be filling.
+    __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0)
+    {
+#if (CUB_PTX_ARCH > 0)
+        (void)stream;   // unused on this path
+        d_counters[FILL] = 0;
+        return cudaSuccess;
+#else
+        return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
+#endif
+    }
+
+
+    /// Reads the fill counter into \p fill_size (the fill-size established by the parent or by the previous kernel).  NOTE(review): the non-device path issues cudaMemcpyAsync on \p stream; confirm the copy has completed before reading \p fill_size.
+    __host__ __device__ __forceinline__ cudaError_t FillSize(
+        OffsetT &fill_size,
+        cudaStream_t stream = 0)
+    {
+#if (CUB_PTX_ARCH > 0)
+        (void)stream;   // unused on this path
+        fill_size = d_counters[FILL];
+        return cudaSuccess;
+#else
+        return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
+#endif
+    }
+
+
+    /// Drain \p num_items from the queue by atomically adding \p num_items to the drain counter.  Returns the atomicAdd result, used as the offset from which to read items.  To be called from CUDA kernel.
+    __device__ __forceinline__ OffsetT Drain(OffsetT num_items)
+    {
+        return atomicAdd(d_counters + DRAIN, num_items);
+    }
+
+
+    /// Fill \p num_items into the queue by atomically adding \p num_items to the fill counter.  Returns the atomicAdd result, used as the offset from which to write items.  To be called from CUDA kernel.
+    __device__ __forceinline__ OffsetT Fill(OffsetT num_items)
+    {
+        return atomicAdd(d_counters + FILL, num_items);
+    }
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Kernel wrapper around GridQueue::FillAndResetDrain: sets the fill-size of \p grid_queue to \p num_items and zeroes its drain counter (call with 1 block of 1 thread)
+ */
+template <typename OffsetT>
+__global__ void FillAndResetDrainKernel(
+    GridQueue<OffsetT>   grid_queue,    ///< Queue descriptor whose counters are rewritten
+    OffsetT              num_items)     ///< Value stored as the new fill-size
+{
+    grid_queue.FillAndResetDrain(num_items);    // the returned cudaError_t is not inspected
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group GridModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/external/cub/cub/host/mutex.cuh b/external/cub/cub/host/mutex.cuh
new file mode 100644
index 00000000000..0054f1f916d
--- /dev/null
+++ b/external/cub/cub/host/mutex.cuh
@@ -0,0 +1,171 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Simple portable mutex
+ */
+
+
+#pragma once
+
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+    #include <mutex>
+#else
+    #if defined(_WIN32) || defined(_WIN64)
+        #include <intrin.h>
+
+        #define WIN32_LEAN_AND_MEAN
+        #define NOMINMAX
+        #include <windows.h>
+        #undef WIN32_LEAN_AND_MEAN
+        #undef NOMINMAX
+
+        /**
+         * Compiler read/write barrier
+         */
+        #pragma intrinsic(_ReadWriteBarrier)
+
+    #endif
+#endif
+
+#include "../util_namespace.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * Simple portable mutex
+ *   - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms); only this variant provides TryLock()
+ *   - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++)
+ */
+struct Mutex
+{
+#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
+
+    std::mutex mtx;     ///< Wrapped standard mutex
+
+    void Lock()         // Blocks until the mutex has been acquired
+    {
+        mtx.lock();
+    }
+
+    void Unlock()       // Releases the mutex
+    {
+        mtx.unlock();
+    }
+
+    bool TryLock()      // Non-blocking acquire; returns true only if the calling thread now owns the mutex
+    {
+        return mtx.try_lock();
+    }
+
+#else       // pre-C++11 spinlock fallback
+
+    #if defined(_MSC_VER)
+
+        // Microsoft VC++
+        typedef long Spinlock;
+
+    #else
+
+        // GNU g++
+        typedef int Spinlock;
+
+        /**
+         * Read/write barrier for the GNU path (issues __sync_synchronize)
+         */
+        __forceinline__ void _ReadWriteBarrier()
+        {
+            __sync_synchronize();
+        }
+
+        /**
+         * Atomic exchange: stores Value into *Target and returns the previous contents
+         */
+        __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value)
+        {
+            // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier
+            _ReadWriteBarrier();
+            return __sync_lock_test_and_set(Target, Value);
+        }
+
+        /**
+         * Spin-wait hint called while the lock is held elsewhere (empty in this GNU fallback)
+         */
+        __forceinline__ void YieldProcessor()
+        {
+        }
+
+    #endif  // defined(_MSC_VER)
+
+        /// Spinlock word: 0 when free, 1 when held
+        volatile Spinlock lock;
+
+        /**
+         * Constructor (the spinlock starts out free)
+         */
+        Mutex() : lock(0) {}
+
+        /**
+         * Return when the specified spinlock has been acquired
+         */
+        __forceinline__ void Lock()
+        {
+            while (1)
+            {
+                if (!_InterlockedExchange(&lock, 1)) return;    // previous value 0: we took the lock
+                while (lock) YieldProcessor();                  // spin on plain reads until it looks free
+            }
+        }
+
+
+        /**
+         * Release the specified spinlock
+         */
+        __forceinline__ void Unlock()
+        {
+            _ReadWriteBarrier();    // barrier before the releasing store
+            lock = 0;
+        }
+
+#endif      // __cplusplus > 199711L
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/external/cub/cub/iterator/arg_index_input_iterator.cuh b/external/cub/cub/iterator/arg_index_input_iterator.cuh
new file mode 100644
index 00000000000..d3bce583d8c
--- /dev/null
+++ b/external/cub/cub/iterator/arg_index_input_iterator.cuh
@@ -0,0 +1,259 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+#include <thrust/version.h>
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples).
+ *
+ * \par Overview
+ * - ArgIndexInputIterator wraps a random access input iterator \p itr of type \p InputIteratorT.
+ *   Dereferencing an ArgIndexInputIterator at offset \p i produces a \p KeyValuePair value whose
+ *   \p key field is \p i and whose \p value field is <tt>itr[i]</tt>.
+ * - Can be used with any data type.
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
+ *   device memory can only be dereferenced on the device.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p ArgIndexInputIterator to
+ * dereference an array of doubles
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/arg_index_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * double *d_in;         // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::ArgIndexInputIterator<double*> itr(d_in);
+ *
+ * // Within device code:
+ * typedef typename cub::ArgIndexInputIterator<double*>::value_type Tuple;
+ * Tuple item_offset_pair = *itr;
+ * printf("%f @ %d\n",
+ *   item_offset_pair.value,
+ *   item_offset_pair.key);   // 8.0 @ 0
+ *
+ * itr = itr + 6;
+ * item_offset_pair = *itr;
+ * printf("%f @ %d\n",
+ *   item_offset_pair.value,
+ *   item_offset_pair.key);   // 9.0 @ 6
+ *
+ * \endcode
+ *
+ * \tparam InputIteratorT       The type of the wrapped random-access input iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ * \tparam OutputValueT         The paired value type of the <offset,value> tuple (Default: value type of input iterator)
+ */
+template <
+    typename    InputIteratorT,
+    typename    OffsetT             = ptrdiff_t,
+    typename    OutputValueT        = typename std::iterator_traits<InputIteratorT>::value_type>
+class ArgIndexInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef ArgIndexInputIterator                       self_type;              ///< My own type
+    typedef OffsetT                                     difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef KeyValuePair<difference_type, OutputValueT> value_type;             ///< The type of the element the iterator can point to
+    typedef value_type*                                 pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef value_type                                  reference;              ///< The type produced by dereferencing (a value_type returned by value, not a true reference)
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    InputIteratorT  itr;        // Wrapped input iterator
+    difference_type offset;     // Current position, in items, relative to itr
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ArgIndexInputIterator(
+        InputIteratorT  itr,            ///< Input iterator to wrap
+        difference_type offset = 0)     ///< OffsetT (in items) from \p itr denoting the position of the iterator
+    :
+        itr(itr),
+        offset(offset)
+    {}
+
+    /// Postfix increment: advances by one item and returns a copy of the iterator as it was before advancing
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        offset++;
+        return retval;
+    }
+
+    /// Prefix increment: advances by one item and returns a reference to this iterator
+    __host__ __device__ __forceinline__ self_type& operator++()
+    {
+        offset++;
+        return *this;
+    }
+
+    /// Indirection: returns the pair (key = current offset, value = itr[offset]) by value
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        value_type retval;
+        retval.value = itr[offset];
+        retval.key = offset;
+        return retval;
+    }
+
+    /// Addition: returns an iterator over the same wrapped iterator, positioned \p n items further on
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(itr, offset + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        offset += n;
+        return *this;
+    }
+
+    /// Subtraction: returns an iterator over the same wrapped iterator, positioned \p n items earlier
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(itr, offset - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        offset -= n;
+        return *this;
+    }
+
+    /// Distance: difference of the two offsets (the wrapped iterators themselves are not compared)
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return offset - other.offset;
+    }
+
+    /// Array subscript: equivalent to dereferencing (*this + n)
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        self_type shifted = (*this) + n;    // local renamed so it no longer shadows the member 'offset'
+        return *shifted;
+    }
+
+    /// Structure dereference.  NOTE(review): operator*() returns by value, so this takes the address of a temporary; verify before instantiating
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to: same wrapped iterator and same offset (const so that const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
+    {
+        return ((itr == rhs.itr) && (offset == rhs.offset));
+    }
+
+    /// Not equal to (const so that const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
+    {
+        return ((itr != rhs.itr) || (offset != rhs.offset));
+    }
+
+    /// Normalize: folds the current offset into the wrapped iterator and resets the offset to zero
+    __host__ __device__ __forceinline__ void normalize()
+    {
+        itr += offset;
+        offset = 0;
+    }
+
+    /// ostream operator (writes nothing; returns \p os unchanged)
+    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
+    {
+        return os;
+    }
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/cache_modified_input_iterator.cuh b/external/cub/cub/iterator/cache_modified_input_iterator.cuh
new file mode 100644
index 00000000000..0c0252c8b1a
--- /dev/null
+++ b/external/cub/cub/iterator/cache_modified_input_iterator.cuh
@@ -0,0 +1,240 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
+ *
+ * \par Overview
+ * - CacheModifiedInputIterator is a random-access input iterator that wraps a native
+ *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
+ *   made by reading \p ValueType values through loads modified by \p MODIFIER.
+ * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG",
+ *   "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.).
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions, but can only be dereferenced within device functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p CacheModifiedInputIterator to
+ * dereference a device array of double using the "ldg" PTX load modifier
+ * (i.e., load values through texture cache).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * double *d_in;            // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);  // 8.0
+ * printf("%f\n", itr[1]);  // 6.0
+ * printf("%f\n", itr[6]);  // 9.0
+ *
+ * \endcode
+ *
+ * \tparam MODIFIER             The cub::CacheLoadModifier to use when accessing data
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    CacheLoadModifier   MODIFIER,
+    typename            ValueType,
+    typename            OffsetT = ptrdiff_t>
+class CacheModifiedInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef CacheModifiedInputIterator          self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type produced by dereferencing (a ValueType returned by value, not a true reference)
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+
+public:
+
+    /// Wrapped native pointer
+    ValueType* ptr;
+
+    /// Constructor: stores \p ptr after casting it to the RemoveQualifiers-adjusted pointer type
+    template <typename QualifiedValueType>
+    __host__ __device__ __forceinline__ CacheModifiedInputIterator(
+        QualifiedValueType* ptr)     ///< Native pointer to wrap
+    :
+        ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
+    {}
+
+    /// Postfix increment: advances by one item and returns a copy of the iterator as it was before advancing
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        ptr++;
+        return retval;
+    }
+
+    /// Prefix increment: advances by one item and returns a reference to this iterator
+    __host__ __device__ __forceinline__ self_type& operator++()
+    {
+        ptr++;
+        return *this;
+    }
+
+    /// Indirection (device code only): loads the pointed-to item through ThreadLoad<MODIFIER>
+    __device__ __forceinline__ reference operator*() const
+    {
+        return ThreadLoad<MODIFIER>(ptr);
+    }
+
+    /// Addition: returns an iterator wrapping ptr + n
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(ptr + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        ptr += n;
+        return *this;
+    }
+
+    /// Subtraction: returns an iterator wrapping ptr - n
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(ptr - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        ptr -= n;
+        return *this;
+    }
+
+    /// Distance: number of items between the two wrapped pointers
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return ptr - other.ptr;
+    }
+
+    /// Array subscript (device code only): loads the item at ptr + n through ThreadLoad<MODIFIER>
+    template <typename Distance>
+    __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return ThreadLoad<MODIFIER>(ptr + n);
+    }
+
+    /// Structure dereference.  NOTE(review): takes the address of the ThreadLoad result; if that is returned by value this is the address of a temporary -- verify before instantiating
+    __device__ __forceinline__ pointer operator->()
+    {
+        return &ThreadLoad<MODIFIER>(ptr);
+    }
+
+    /// Equal to: both iterators wrap the same address (const so that const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
+    {
+        return (ptr == rhs.ptr);
+    }
+
+    /// Not equal to (const so that const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
+    {
+        return (ptr != rhs.ptr);
+    }
+
+    /// ostream operator (writes nothing; returns \p os unchanged)
+    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
+    {
+        return os;
+    }
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/cache_modified_output_iterator.cuh b/external/cub/cub/iterator/cache_modified_output_iterator.cuh
new file mode 100644
index 00000000000..8dbaafa61c5
--- /dev/null
+++ b/external/cub/cub/iterator/cache_modified_output_iterator.cuh
@@ -0,0 +1,254 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access output wrapper for storing array values using a PTX cache-modifier.
+ *
+ * \par Overview
+ * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native
+ *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
+ *   made by writing \p ValueType values through stores modified by \p MODIFIER.
+ * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB",
+ *   "STORE_CG", "STORE_CS", "STORE_WT", etc.).
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions, but can only be dereferenced within device functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to
+ * dereference a device array of doubles using the "wt" PTX store modifier
+ * (i.e., write-through to system memory).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_output_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * double *d_out;              // e.g., [, , , , , , ]
+ *
+ * // Create an iterator wrapper
+ * cub::CacheModifiedOutputIterator<cub::STORE_WT, double> itr(d_out);
+ *
+ * // Within device code:
+ * itr[0]  = 8.0;
+ * itr[1]  = 66.0;
+ * itr[55] = 24.0;
+ *
+ * \endcode
+ *
+ * \par Usage Considerations
+ * - Can only be dereferenced within device code
+ *
+ * \tparam MODIFIER             The cub::CacheStoreModifier to use when accessing data
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    CacheStoreModifier  MODIFIER,
+    typename            ValueType,
+    typename            OffsetT = ptrdiff_t>
+class CacheModifiedOutputIterator
+{
+private:
+
+    // Proxy handed out by operator* and operator[]; assigning to it performs the store
+    struct Reference
+    {
+        ValueType* ptr;     ///< Address the assignment will store to
+
+        /// Constructor (only records the target address; nothing is written here)
+        __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {}
+
+        /// Assignment: stores \p val to \c ptr through ThreadStore<MODIFIER> and returns \p val (device code only)
+        __device__ __forceinline__ ValueType operator =(ValueType val)
+        {
+            ThreadStore<MODIFIER>(ptr, val);
+            return val;
+        }
+    };
+
+public:
+
+    // Required iterator traits
+    typedef CacheModifiedOutputIterator         self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef void                                value_type;             ///< The type of the element the iterator can point to
+    typedef void                                pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef Reference                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ValueType* ptr;
+
+public:
+
+    /// Constructor: wraps \p ptr, const_cast to the pointee type produced by RemoveQualifiers
+    template <typename QualifiedValueType>
+    __host__ __device__ __forceinline__ CacheModifiedOutputIterator(
+        QualifiedValueType* ptr)     ///< Native pointer to wrap
+    :
+        ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
+    {}
+
+    /// Postfix increment: advances \c ptr by one and returns a copy of the previous iterator
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        ptr++;
+        return retval;
+    }
+
+
+    /// Prefix increment: advances \c ptr by one and returns a copy of the advanced iterator (by value)
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        ptr++;
+        return *this;
+    }
+
+    /// Indirection: returns a Reference proxy for \c ptr; the store happens when the proxy is assigned to
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return Reference(ptr);
+    }
+
+    /// Addition: returns a new iterator wrapping \c ptr + \p n
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(ptr + n);
+        return retval;
+    }
+
+    /// Addition assignment: adds \p n to \c ptr in place
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        ptr += n;
+        return *this;
+    }
+
+    /// Subtraction: returns a new iterator wrapping \c ptr - \p n
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(ptr - n);
+        return retval;
+    }
+
+    /// Subtraction assignment: subtracts \p n from \c ptr in place
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        ptr -= n;
+        return *this;
+    }
+
+    /// Distance: this iterator's \c ptr minus \p other's \c ptr
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return ptr - other.ptr;
+    }
+
+    /// Array subscript: returns a Reference proxy for \c ptr + \p n
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return Reference(ptr + n);
+    }
+
+    /// Equal to (const-qualified so that const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
+    {
+        return (ptr == rhs.ptr);
+    }
+
+    /// Not equal to (const-qualified so that const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
+    {
+        return (ptr != rhs.ptr);
+    }
+
+    /// ostream operator: writes nothing; the iterator parameter is left unnamed because it is unused
+    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
+    {
+        return os;
+    }
+};
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/constant_input_iterator.cuh b/external/cub/cub/iterator/constant_input_iterator.cuh
new file mode 100644
index 00000000000..0b7af478d74
--- /dev/null
+++ b/external/cub/cub/iterator/constant_input_iterator.cuh
@@ -0,0 +1,235 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_namespace.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input generator for dereferencing a sequence of homogeneous values
+ *
+ * \par Overview
+ * - Read references to a ConstantInputIterator always return the supplied constant
+ *   of type \p ValueType.
+ * - Can be used with any data type.
+ * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
+ *   functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p ConstantInputIterator to
+ * dereference a sequence of homogeneous doubles.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/constant_input_iterator.cuh>
+ *
+ * cub::ConstantInputIterator<double> itr(5.0);
+ *
+ * printf("%f\n", itr[0]);      // 5.0
+ * printf("%f\n", itr[1]);      // 5.0
+ * printf("%f\n", itr[2]);      // 5.0
+ * printf("%f\n", itr[50]);     // 5.0
+ *
+ * \endcode
+ *
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename ValueType,
+    typename OffsetT = ptrdiff_t>
+class ConstantInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef ConstantInputIterator               self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ValueType   val;        // The constant returned by every dereference / subscript
+    OffsetT     offset;     // Logical position; only used for iterator arithmetic and comparison
+#ifdef _WIN32
+    OffsetT     pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))];        // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
+#endif
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ConstantInputIterator(
+        ValueType   val,            ///< Constant value that every dereference of this iterator reports
+        OffsetT     offset = 0)     ///< Base offset
+    :
+        val(val),
+        offset(offset)
+    {}
+
+    /// Postfix increment: bumps \c offset and returns a copy of the previous iterator
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        offset++;
+        return retval;
+    }
+
+    /// Prefix increment: bumps \c offset and returns a copy of the advanced iterator (by value)
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        offset++;
+        return *this;
+    }
+
+    /// Indirection: returns the constant, independent of \c offset
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return val;
+    }
+
+    /// Addition: returns a new iterator with the same constant and \c offset + \p n
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(val, offset + n);
+        return retval;
+    }
+
+    /// Addition assignment: adds \p n to \c offset in place
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        offset += n;
+        return *this;
+    }
+
+    /// Subtraction: returns a new iterator with the same constant and \c offset - \p n
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(val, offset - n);
+        return retval;
+    }
+
+    /// Subtraction assignment: subtracts \p n from \c offset in place
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        offset -= n;
+        return *this;
+    }
+
+    /// Distance: difference of the two iterators' offsets
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return offset - other.offset;
+    }
+
+    /// Array subscript: returns the constant regardless of the index
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const
+    {
+        return val;
+    }
+
+    /// Structure dereference: address of the stored constant
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &val;
+    }
+
+    /// Equal to: same offset and same constant (const-qualified so const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
+    {
+        return (offset == rhs.offset) && ((val == rhs.val));
+    }
+
+    /// Not equal to: different offset or different constant (const-qualified so const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
+    {
+        return (offset != rhs.offset) || (val!= rhs.val);
+    }
+
+    /// ostream operator: prints "[val,offset]"
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        os << "[" << itr.val << "," << itr.offset << "]";
+        return os;
+    }
+
+};
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/counting_input_iterator.cuh b/external/cub/cub/iterator/counting_input_iterator.cuh
new file mode 100644
index 00000000000..3b42a00d181
--- /dev/null
+++ b/external/cub/cub/iterator/counting_input_iterator.cuh
@@ -0,0 +1,228 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+/**
+ * \brief A random-access input generator for dereferencing a sequence of incrementing integer values.
+ *
+ * \par Overview
+ * - After initializing a CountingInputIterator to a certain integer \p base, read references
+ *   at \p offset will return the value \p base + \p offset.
+ * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
+ *   functions.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p CountingInputIterator to
+ * dereference a sequence of incrementing integers.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/counting_input_iterator.cuh>
+ *
+ * cub::CountingInputIterator<int> itr(5);
+ *
+ * printf("%d\n", itr[0]);      // 5
+ * printf("%d\n", itr[1]);      // 6
+ * printf("%d\n", itr[2]);      // 7
+ * printf("%d\n", itr[50]);     // 55
+ *
+ * \endcode
+ *
+ * \tparam ValueType            The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename ValueType,
+    typename OffsetT = ptrdiff_t>
+class CountingInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef CountingInputIterator               self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ValueType val;      // Current count; doubles as the iterator's position
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ CountingInputIterator(
+        const ValueType &val)          ///< Starting value for the iterator instance to report
+    :
+        val(val)
+    {}
+
+    /// Postfix increment: bumps the count and returns a copy of the previous iterator
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        val++;
+        return retval;
+    }
+
+    /// Prefix increment: bumps the count and returns a copy of the advanced iterator (by value)
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        val++;
+        return *this;
+    }
+
+    /// Indirection: returns the current count
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return val;
+    }
+
+    /// Addition: returns a new iterator whose count is \c val + \p n (with \p n cast to ValueType)
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(val + (ValueType) n);
+        return retval;
+    }
+
+    /// Addition assignment: adds \p n (cast to ValueType) to the count in place
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        val += (ValueType) n;
+        return *this;
+    }
+
+    /// Subtraction: returns a new iterator whose count is \c val - \p n (with \p n cast to ValueType)
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(val - (ValueType) n);
+        return retval;
+    }
+
+    /// Subtraction assignment: subtracts \p n (cast to ValueType, matching operator+= and operator-) in place
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        val -= (ValueType) n;
+        return *this;
+    }
+
+    /// Distance: difference of the two counts, cast to \c difference_type
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return (difference_type) (val - other.val);
+    }
+
+    /// Array subscript: returns \c val + \p n (with \p n cast to ValueType) without modifying the iterator
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return val + (ValueType) n;
+    }
+
+    /// Structure dereference: address of the stored count
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &val;
+    }
+
+    /// Equal to: same count (const-qualified so const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
+    {
+        return (val == rhs.val);
+    }
+
+    /// Not equal to: different count (const-qualified so const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
+    {
+        return (val != rhs.val);
+    }
+
+    /// ostream operator: prints "[val]"
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        os << "[" << itr.val << "]";
+        return os;
+    }
+
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/discard_output_iterator.cuh b/external/cub/cub/iterator/discard_output_iterator.cuh
new file mode 100644
index 00000000000..1fca08c062d
--- /dev/null
+++ b/external/cub/cub/iterator/discard_output_iterator.cuh
@@ -0,0 +1,220 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../util_namespace.cuh"
+#include "../util_macro.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A discard iterator
+ */
+template <typename OffsetT = ptrdiff_t>
+class DiscardOutputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef DiscardOutputIterator   self_type;              ///< My own type
+    typedef OffsetT                 difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef void                    value_type;             ///< The type of the element the iterator can point to
+    typedef void                    pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef void                    reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    OffsetT offset;
+
+#if defined(_WIN32) || !defined(_WIN64)
+    // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce). NOTE(review): as written this condition also holds whenever _WIN64 is undefined (e.g. non-Windows builds) -- confirm intent before changing, since it affects object size
+    OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))];
+#endif
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ DiscardOutputIterator(
+        OffsetT offset = 0)     ///< Base offset
+    :
+        offset(offset)
+    {}
+
+    /// Postfix increment: bumps \c offset and returns a copy of the previous iterator
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        offset++;
+        return retval;
+    }
+
+    /// Prefix increment: bumps \c offset and returns a copy of the advanced iterator (by value)
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ self_type& operator*()
+    {
+        // return self reference, which can be assigned to anything
+        return *this;
+    }
+
+    /// Addition: returns a new iterator at \c offset + \p n
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(offset + n);
+        return retval;
+    }
+
+    /// Addition assignment: adds \p n to \c offset in place
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        offset += n;
+        return *this;
+    }
+
+    /// Subtraction: returns a new iterator at \c offset - \p n
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(offset - n);
+        return retval;
+    }
+
+    /// Subtraction assignment: subtracts \p n from \c offset in place
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        offset -= n;
+        return *this;
+    }
+
+    /// Distance: difference of the two iterators' offsets
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return offset - other.offset;
+    }
+
+    /// Array subscript (the index is ignored, so the parameter is left unnamed)
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator[](Distance /*n*/)
+    {
+        // return self reference, which can be assigned to anything
+        return *this;
+    }
+
+    /// Structure dereference (\c pointer is \c void, so nothing is returned)
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return;
+    }
+
+    /// Assignment from another DiscardOutputIterator: copies its offset (unlike the generic overload, this is not a no-op)
+    __host__ __device__ __forceinline__ void operator=(self_type const& other)
+    {
+        offset = other.offset;
+    }
+
+    /// Assignment to anything else (no-op)
+    template<typename T>
+    __host__ __device__ __forceinline__ void operator=(T const&)
+    {}
+
+    /// Cast to void* operator (always yields NULL)
+    __host__ __device__ __forceinline__ operator void*() const { return NULL; }
+
+    /// Equal to: same offset (const-qualified so const iterators use this overload rather than the void* conversion)
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
+    {
+        return (offset == rhs.offset);
+    }
+
+    /// Not equal to: different offset (const-qualified so const iterators use this overload rather than the void* conversion)
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
+    {
+        return (offset != rhs.offset);
+    }
+
+    /// ostream operator: prints "[offset]"
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        os << "[" << itr.offset << "]";
+        return os;
+    }
+
+};
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/tex_obj_input_iterator.cuh b/external/cub/cub/iterator/tex_obj_input_iterator.cuh
new file mode 100644
index 00000000000..623609452fd
--- /dev/null
+++ b/external/cub/cub/iterator/tex_obj_input_iterator.cuh
@@ -0,0 +1,310 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_debug.cuh"
+#include "../util_namespace.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+
+/**
+ * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses newer Kepler-style texture objects.
+ *
+ * \par Overview
+ * - TexObjInputIterator wraps a native device pointer of type <tt>ValueType*</tt>. References
+ *   to elements are to be loaded through texture cache.
+ * - Can be used to load any data type from memory through texture cache.
+ * - Can be manipulated and exchanged within and between host and device
+ *   functions, can only be constructed within host functions, and can only be
+ *   dereferenced within device functions.
+ * - With regard to nested/dynamic parallelism, TexObjInputIterator iterators may only be
+ *   created by the host thread, but can be used by any descendant kernel.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p TexObjInputIterator to
+ * dereference a device array of doubles through texture cache.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_obj_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * int num_items;   // e.g., 7
+ * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::TexObjInputIterator<double> itr;
+ * itr.BindTexture(d_in, sizeof(double) * num_items);
+ * ...
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);      // 8.0
+ * printf("%f\n", itr[1]);      // 6.0
+ * printf("%f\n", itr[6]);      // 9.0
+ *
+ * ...
+ * itr.UnbindTexture();
+ *
+ * \endcode
+ *
+ * \tparam T                    The value type of this iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename    T,
+    typename    OffsetT = ptrdiff_t>
+class TexObjInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef TexObjInputIterator                 self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef T                                   value_type;             ///< The type of the element the iterator can point to
+    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    // Largest texture word we can use in device
+    typedef typename UnitWord<T>::TextureWord TextureWord;
+
+    // Number of texture words per T
+    enum {
+        TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
+    };
+
+private:
+
+    T*                  ptr;
+    difference_type     tex_offset;
+    cudaTextureObject_t tex_obj;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ TexObjInputIterator()
+    :
+        ptr(NULL),
+        tex_offset(0),
+        tex_obj(0)
+    {}
+
+    /// Wrap \p ptr in this iterator and create a texture object over it (the result of cudaCreateTextureObject is returned to the caller)
+    template <typename QualifiedT>
+    cudaError_t BindTexture(
+        QualifiedT      *ptr,               ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
+        size_t          bytes = size_t(-1),         ///< Number of bytes in the range
+        size_t          tex_offset = 0)     ///< Offset (in items) from \p ptr denoting the position of the iterator
+    {
+        this->ptr = const_cast<typename RemoveQualifiers<QualifiedT>::Type *>(ptr);
+        this->tex_offset = tex_offset;
+
+        cudaChannelFormatDesc   channel_desc = cudaCreateChannelDesc<TextureWord>();
+        cudaResourceDesc        res_desc;
+        cudaTextureDesc         tex_desc;
+        memset(&res_desc, 0, sizeof(cudaResourceDesc));
+        memset(&tex_desc, 0, sizeof(cudaTextureDesc));
+        res_desc.resType                = cudaResourceTypeLinear;
+        res_desc.res.linear.devPtr      = this->ptr;
+        res_desc.res.linear.desc        = channel_desc;
+        res_desc.res.linear.sizeInBytes = bytes;
+        tex_desc.readMode               = cudaReadModeElementType;
+        return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
+    }
+
+    /// Destroy the texture object held by this iterator (returns the status of cudaDestroyTextureObject)
+    cudaError_t UnbindTexture()
+    {
+        return cudaDestroyTextureObject(tex_obj);
+    }
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        tex_offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        tex_offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+#if (CUB_PTX_ARCH == 0)
+        // Simply dereference the pointer on the host
+        return ptr[tex_offset];
+#else
+        // Move array of uninitialized words, then alias and assign to return value
+        TextureWord words[TEXTURE_MULTIPLE];
+
+        #pragma unroll
+        for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
+        {
+            words[i] = tex1Dfetch<TextureWord>(
+                tex_obj,
+                (tex_offset * TEXTURE_MULTIPLE) + i);
+        }
+
+        // Load from words
+        return *reinterpret_cast<T*>(words);
+#endif
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval;
+        retval.ptr          = ptr;
+        retval.tex_obj      = tex_obj;
+        retval.tex_offset   = tex_offset + n;
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        tex_offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval;
+        retval.ptr          = ptr;
+        retval.tex_obj      = tex_obj;
+        retval.tex_offset   = tex_offset - n;
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        tex_offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return tex_offset - other.tex_offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        self_type offset = (*this) + n;
+        return *offset;
+    }
+
+    /// Structure dereference.  NOTE(review): operator* returns \p T by value (\p reference is typedef'd to \p T), so the address-of below applies to a temporary -- confirm this member is never instantiated
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to: same wrapped pointer, same offset and same texture object (const-qualified so const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
+    {
+        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj));
+    }
+
+    /// Not equal to: differs in wrapped pointer, offset or texture object (const-qualified so const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
+    {
+        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj));
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        return os;
+    }
+
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/tex_ref_input_iterator.cuh b/external/cub/cub/iterator/tex_ref_input_iterator.cuh
new file mode 100644
index 00000000000..da1fd166177
--- /dev/null
+++ b/external/cub/cub/iterator/tex_ref_input_iterator.cuh
@@ -0,0 +1,374 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_debug.cuh"
+#include "../util_namespace.cuh"
+
+#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE)  // This iterator is compatible with CUDA 5.5 and newer
+
+#if (THRUST_VERSION >= 100700)    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Static file-scope Tesla/Fermi-style texture references
+ *****************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+// Anonymous namespace
+namespace {
+
+/// Global texture reference specialized by type
+template <typename T>
+struct IteratorTexRef
+{
+    /// And by unique ID
+    template <int UNIQUE_ID>
+    struct TexId
+    {
+        // Largest texture word we can use in device
+        typedef typename UnitWord<T>::DeviceWord DeviceWord;
+        typedef typename UnitWord<T>::TextureWord TextureWord;
+
+        // Number of texture words per T
+        enum {
+            DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord),
+            TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
+        };
+
+        // Texture reference type
+        typedef texture<TextureWord> TexRef;
+
+        // Texture reference
+        static TexRef ref;
+
+        /// Bind texture
+        static cudaError_t BindTexture(void *d_in, size_t &offset)
+        {
+            if (d_in)
+            {
+                cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<TextureWord>();
+                ref.channelDesc = tex_desc;
+                return (CubDebug(cudaBindTexture(&offset, ref, d_in)));
+            }
+
+            return cudaSuccess;
+        }
+
+        /// Unbind texture
+        static cudaError_t UnbindTexture()
+        {
+            return CubDebug(cudaUnbindTexture(ref));
+        }
+
+        /// Fetch element
+        template <typename Distance>
+        static __device__ __forceinline__ T Fetch(Distance tex_offset)
+        {
+            DeviceWord temp[DEVICE_MULTIPLE];
+            TextureWord *words = reinterpret_cast<TextureWord*>(temp);
+
+            #pragma unroll
+            for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
+            {
+                words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i);
+            }
+
+            return reinterpret_cast<T&>(temp);
+        }
+    };
+};
+
+// Texture reference definitions
+template <typename  T>
+template <int       UNIQUE_ID>
+typename IteratorTexRef<T>::template TexId<UNIQUE_ID>::TexRef IteratorTexRef<T>::template TexId<UNIQUE_ID>::ref = 0;
+
+
+} // Anonymous namespace
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+
+/**
+ * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses older Tesla/Fermi-style texture references.
+ *
+ * \par Overview
+ * - TexRefInputIterator wraps a native device pointer of type <tt>ValueType*</tt>. References
+ *   to elements are to be loaded through texture cache.
+ * - Can be used to load any data type from memory through texture cache.
+ * - Can be manipulated and exchanged within and between host and device
+ *   functions, can only be constructed within host functions, and can only be
+ *   dereferenced within device functions.
+ * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture
+ *   reference.  Only one TexRefInputIterator instance can be bound at any given time for a
+ *   specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host
+ *   thread, and (4) compilation .o unit.
+ * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be
+ *   created by the host thread and used by a top-level kernel (i.e. the one which is launched
+ *   from the host).
+ * - Compatible with Thrust API v1.7 or newer.
+ * - Compatible with CUDA toolkit v5.5 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p TexRefInputIterator to
+ * dereference a device array of doubles through texture cache.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_ref_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * int num_items;   // e.g., 7
+ * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::TexRefInputIterator<double, __LINE__> itr;
+ * itr.BindTexture(d_in, sizeof(double) * num_items);
+ * ...
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);      // 8.0
+ * printf("%f\n", itr[1]);      // 6.0
+ * printf("%f\n", itr[6]);      // 9.0
+ *
+ * ...
+ * itr.UnbindTexture();
+ *
+ * \endcode
+ *
+ * \tparam T                    The value type of this iterator
+ * \tparam UNIQUE_ID            A globally-unique identifier (within the compilation unit) to name the underlying texture reference
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename    T,
+    int         UNIQUE_ID,
+    typename    OffsetT = ptrdiff_t>
+class TexRefInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef TexRefInputIterator                 self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef T                                   value_type;             ///< The type of the element the iterator can point to
+    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    T*              ptr;
+    difference_type tex_offset;
+
+    // Texture reference wrapper (old Tesla/Fermi-style textures)
+    typedef typename IteratorTexRef<T>::template TexId<UNIQUE_ID> TexId;
+
+public:
+/*
+    /// Constructor
+    __host__ __device__ __forceinline__ TexRefInputIterator()
+    :
+        ptr(NULL),
+        tex_offset(0)
+    {}
+*/
+    /// Use this iterator to bind \p ptr with a texture reference (\p bytes is currently unused by this implementation)
+    template <typename QualifiedT>
+    cudaError_t BindTexture(
+        QualifiedT      *ptr,                   ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
+        size_t          bytes = size_t(-1),     ///< Number of bytes in the range
+        size_t          tex_offset = 0)         ///< Offset (in items) from \p ptr denoting the position of the iterator
+    {
+        this->ptr = const_cast<typename RemoveQualifiers<QualifiedT>::Type *>(ptr);
+        size_t offset = 0;  // TexId::BindTexture returns early for a NULL address without writing this; keep it defined
+        cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset);
+        this->tex_offset = (difference_type) (offset / sizeof(QualifiedT));
+        return retval;
+    }
+
+    /// Unbind this iterator from its texture reference
+    cudaError_t UnbindTexture()
+    {
+        return TexId::UnbindTexture();
+    }
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        tex_offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        tex_offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+#if (CUB_PTX_ARCH == 0)
+        // Simply dereference the pointer on the host
+        return ptr[tex_offset];
+#else
+        // Use the texture reference
+        return TexId::Fetch(tex_offset);
+#endif
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval;
+        retval.ptr = ptr;
+        retval.tex_offset = tex_offset + n;
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        tex_offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval;
+        retval.ptr = ptr;
+        retval.tex_offset = tex_offset - n;
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        tex_offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return tex_offset - other.tex_offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        self_type offset = (*this) + n;
+        return *offset;
+    }
+
+    /// Structure dereference.  NOTE(review): operator* returns \p T by value (\p reference is typedef'd to \p T), so the address-of below applies to a temporary -- confirm this member is never instantiated
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to: same wrapped pointer and same offset (const-qualified so const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) const
+    {
+        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset));
+    }
+
+    /// Not equal to: differs in wrapped pointer or offset (const-qualified so const iterators can be compared)
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) const
+    {
+        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset));
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        return os;
+    }
+
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+#endif // CUDA_VERSION
diff --git a/external/cub/cub/iterator/transform_input_iterator.cuh b/external/cub/cub/iterator/transform_input_iterator.cuh
new file mode 100644
index 00000000000..39258a40c9b
--- /dev/null
+++ b/external/cub/cub/iterator/transform_input_iterator.cuh
@@ -0,0 +1,252 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input wrapper for transforming dereferenced values.
+ *
+ * \par Overview
+ * - TransformInputIterator wraps a unary conversion functor of type \p
+ *   ConversionOp and a random-access input iterator of type <tt>InputIteratorT</tt>,
+ *   using the former to produce references of type \p ValueType from the latter.
+ * - Can be used with any data type.
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
+ *   device memory can only be dereferenced on the device.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p TransformInputIterator to
+ * dereference an array of integers, tripling the values and converting them to doubles.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/transform_input_iterator.cuh>
+ *
+ * // Functor for tripling integer values and converting to doubles
+ * struct TripleDoubler
+ * {
+ *     __host__ __device__ __forceinline__
+ *     double operator()(const int &a) const {
+ *         return double(a * 3);
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize a device array
+ * int *d_in;                   // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * TripleDoubler conversion_op;
+ *
+ * // Create an iterator wrapper
+ * cub::TransformInputIterator<double, TripleDoubler, int*> itr(d_in, conversion_op);
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);  // 24.0
+ * printf("%f\n", itr[1]);  // 18.0
+ * printf("%f\n", itr[6]);  // 27.0
+ *
+ * \endcode
+ *
+ * \tparam ValueType            The value type of this iterator
+ * \tparam ConversionOp         Unary functor type for mapping objects of type \p InputType to type \p ValueType.  Must have member <tt>ValueType operator()(const InputType &datum)</tt>.
+ * \tparam InputIteratorT       The type of the wrapped input iterator
+ * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
+ *
+ */
+template <
+    typename ValueType,
+    typename ConversionOp,
+    typename InputIteratorT,
+    typename OffsetT = ptrdiff_t>
+class TransformInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef TransformInputIterator              self_type;              ///< My own type
+    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
+    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
+    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
+    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                        ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    ConversionOp    conversion_op;
+    InputIteratorT  input_itr;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ TransformInputIterator(
+        InputIteratorT      input_itr,          ///< Input iterator to wrap
+        ConversionOp        conversion_op)      ///< Conversion functor to wrap
+    :
+        conversion_op(conversion_op),
+        input_itr(input_itr)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        input_itr++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        input_itr++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return conversion_op(*input_itr);
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(input_itr + n, conversion_op);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        input_itr += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(input_itr - n, conversion_op);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        input_itr -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return input_itr - other.input_itr;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return conversion_op(input_itr[n]);
+    }
+
+    /// Structure dereference.  NOTE(review): takes the address of the conversion functor's result; for a functor returning \p ValueType by value that is a temporary -- confirm this member is never instantiated
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &conversion_op(*input_itr);
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (input_itr == rhs.input_itr);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (input_itr != rhs.input_itr);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        return os;
+    }
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_load.cuh b/external/cub/cub/thread/thread_load.cuh
new file mode 100644
index 00000000000..9de4bd4149b
--- /dev/null
+++ b/external/cub/cub/thread/thread_load.cuh
@@ -0,0 +1,438 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for reading memory using PTX cache modifiers.
+ */
+
+#pragma once
+
+#include <cuda.h>
+
+#include <iterator>
+
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+//-----------------------------------------------------------------------------
+// Tags and constants
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief Enumeration of cache modifiers for memory load operations; supplied as the MODIFIER template argument of cub::ThreadLoad.
+ */
+enum CacheLoadModifier
+{
+    LOAD_DEFAULT,       ///< Default (no modifier): plain dereference of the pointer/iterator
+    LOAD_CA,            ///< Cache at all levels (PTX ld.ca when CUB_PTX_ARCH >= 200)
+    LOAD_CG,            ///< Cache at global level (PTX ld.cg when CUB_PTX_ARCH >= 200)
+    LOAD_CS,            ///< Cache streaming, likely to be accessed once (PTX ld.cs when CUB_PTX_ARCH >= 200)
+    LOAD_CV,            ///< Cache as volatile, including cached system lines (PTX ld.cv when CUB_PTX_ARCH >= 200)
+    LOAD_LDG,           ///< Cache as texture (PTX ld.global.nc when CUB_PTX_ARCH >= 350)
+    LOAD_VOLATILE,      ///< Volatile (any memory space): read through a volatile-qualified pointer
+};
+
+
+/**
+ * \name Thread I/O (cache modified)
+ * @{
+ */
+
+/**
+ * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers.  Can be used to load any data type.
+ *
+ * \par Example
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
+ *
+ * // 32-bit load using cache-at-all-levels modifier:
+ * int *d_in;
+ * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
+ *
+ * // 16-bit load using default modifier
+ * short *d_in;
+ * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
+ *
+ * // 256-bit load using cache-volatile modifier
+ * double4 *d_in;
+ * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
+ *
+ * // Load of a user-defined struct using cache-streaming modifier
+ * struct TestFoo { bool a; short b; };
+ * TestFoo *d_struct;
+ * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_struct + threadIdx.x);
+ * \endcode
+ *
+ * \tparam MODIFIER             CacheLoadModifier enumeration (given explicitly at the call site, as in the example; it is not deducible from the argument)
+ * \tparam InputIteratorT       <b>[inferred]</b> Input iterator type \iterator
+ */
+template <
+    CacheLoadModifier MODIFIER,
+    typename InputIteratorT>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr);
+
+
+//@}  end member group
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/// Helper structure for templated load iteration (inductive case): handles element COUNT, then recurses with COUNT + 1 until COUNT == MAX
+template <int COUNT, int MAX>
+struct IterateThreadLoad
+{
+    template <CacheLoadModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Load(T const *ptr, T *vals)
+    {
+        vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);                            // cache-modified load of element COUNT
+        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);      // compile-time recursion to the next element
+    }
+
+    template <typename InputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals)
+    {
+        vals[COUNT] = itr[COUNT];                                                   // plain subscript read of element COUNT
+        IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals);                  // compile-time recursion to the next element
+    }
+};
+
+
+/// Helper structure for templated load iteration (termination case): COUNT == MAX, so both members are empty and end the recursion
+template <int MAX>
+struct IterateThreadLoad<MAX, MAX>
+{
+    template <CacheLoadModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {}
+
+    template <typename InputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {}
+};
+
+
+/**
+ * Define 16B ThreadLoad specializations (uint4 and ulonglong2) for the given cache load modifier; ptx_modifier is pasted into the PTX "ld." instruction
+ */
+#define _CUB_LOAD_16(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr)                   \
+    {                                                                                       \
+        uint4 retval;                                                                       \
+        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :                 \
+            "=r"(retval.x),                                                                 \
+            "=r"(retval.y),                                                                 \
+            "=r"(retval.z),                                                                 \
+            "=r"(retval.w) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr)    \
+    {                                                                                       \
+        ulonglong2 retval;                                                                  \
+        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                         \
+            "=l"(retval.x),                                                                 \
+            "=l"(retval.y) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+/**
+ * Define 8B ThreadLoad specializations (ushort4, uint2 and unsigned long long) for the given cache load modifier
+ */
+#define _CUB_LOAD_8(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr)             \
+    {                                                                                       \
+        ushort4 retval;                                                                     \
+        asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" :                 \
+            "=h"(retval.x),                                                                 \
+            "=h"(retval.y),                                                                 \
+            "=h"(retval.z),                                                                 \
+            "=h"(retval.w) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr)                   \
+    {                                                                                       \
+        uint2 retval;                                                                       \
+        asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" :                         \
+            "=r"(retval.x),                                                                 \
+            "=r"(retval.y) :                                                                \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr)    \
+    {                                                                                       \
+        unsigned long long retval;                                                          \
+        asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" :                                  \
+            "=l"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+/**
+ * Define an unsigned int (4B) ThreadLoad specialization for the given cache load modifier (emits a single PTX "ld.<ptx_modifier>.u32")
+ */
+#define _CUB_LOAD_4(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr)                      \
+    {                                                                                       \
+        unsigned int retval;                                                                \
+        asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" :                                  \
+            "=r"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+
+/**
+ * Define an unsigned short (2B) ThreadLoad specialization for the given cache load modifier (emits a single PTX "ld.<ptx_modifier>.u16")
+ */
+#define _CUB_LOAD_2(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr)                \
+    {                                                                                       \
+        unsigned short retval;                                                              \
+        asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" :                                  \
+            "=h"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return retval;                                                                      \
+    }
+
+
+/**
+ * Define an unsigned char (1B) ThreadLoad specialization for the given cache load modifier; the byte is loaded into a .u8 register, widened to 16 bits with cvt.u16.u8, then cast back to unsigned char
+ */
+#define _CUB_LOAD_1(cub_modifier, ptx_modifier)                                              \
+    template<>                                                                              \
+    __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr)                   \
+    {                                                                                       \
+        unsigned short retval;                                                              \
+        asm volatile (                                                                      \
+        "{"                                                                                 \
+        "   .reg .u8 datum;"                                                                \
+        "    ld."#ptx_modifier".u8 datum, [%1];"                                            \
+        "    cvt.u16.u8 %0, datum;"                                                         \
+        "}" :                                                                               \
+            "=h"(retval) :                                                                  \
+            _CUB_ASM_PTR_(ptr));                                                            \
+        return (unsigned char) retval;                                                      \
+    }
+
+
+/**
+ * Define powers-of-two (16B, 8B, 4B, 2B and 1B) ThreadLoad specializations for the given cache load modifier
+ */
+#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)                                            \
+    _CUB_LOAD_16(cub_modifier, ptx_modifier)                                                 \
+    _CUB_LOAD_8(cub_modifier, ptx_modifier)                                                  \
+    _CUB_LOAD_4(cub_modifier, ptx_modifier)                                                  \
+    _CUB_LOAD_2(cub_modifier, ptx_modifier)                                                  \
+    _CUB_LOAD_1(cub_modifier, ptx_modifier)                                                  \
+
+
+/**
+ * Define powers-of-two ThreadLoad specializations for LOAD_CA, LOAD_CG, LOAD_CS, LOAD_CV and LOAD_LDG; the PTX modifier chosen for each depends on CUB_PTX_ARCH
+ */
+#if CUB_PTX_ARCH >= 200
+    _CUB_LOAD_ALL(LOAD_CA, ca)
+    _CUB_LOAD_ALL(LOAD_CG, cg)
+    _CUB_LOAD_ALL(LOAD_CS, cs)
+    _CUB_LOAD_ALL(LOAD_CV, cv)
+#else
+    _CUB_LOAD_ALL(LOAD_CA, global)
+    // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1
+    _CUB_LOAD_ALL(LOAD_CG, volatile.global)
+    _CUB_LOAD_ALL(LOAD_CS, global)
+    _CUB_LOAD_ALL(LOAD_CV, volatile.global)
+#endif
+
+#if CUB_PTX_ARCH >= 350
+    _CUB_LOAD_ALL(LOAD_LDG, global.nc)
+#else
+    _CUB_LOAD_ALL(LOAD_LDG, global)     // below CUB_PTX_ARCH 350 LOAD_LDG falls back to a plain ld.global
+#endif
+
+
+// Macro cleanup: the _CUB_LOAD_* helpers are undefined here so they do not remain defined past this point
+#undef _CUB_LOAD_ALL
+#undef _CUB_LOAD_1
+#undef _CUB_LOAD_2
+#undef _CUB_LOAD_4
+#undef _CUB_LOAD_8
+#undef _CUB_LOAD_16
+
+
+
+/**
+ * ThreadLoad definition for LOAD_DEFAULT modifier on iterator (non-pointer) types: simply dereferences itr; the two tag arguments only select this overload
+ */
+template <typename InputIteratorT>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(
+    InputIteratorT          itr,
+    Int2Type<LOAD_DEFAULT>  /*modifier*/,
+    Int2Type<false>         /*is_pointer*/)
+{
+    return *itr;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types: simply dereferences ptr; the two tag arguments only select this overload
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoad(
+    T                       *ptr,
+    Int2Type<LOAD_DEFAULT>  /*modifier*/,
+    Int2Type<true>          /*is_pointer*/)
+{
+    return *ptr;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types: reads *ptr once through a volatile-qualified view of the pointer
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoadVolatilePointer(
+    T                       *ptr,
+    Int2Type<true>          /*is_primitive*/)
+{
+    T retval = *reinterpret_cast<volatile T*>(ptr);
+    return retval;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types: copies T word-by-word through a volatile VolatileWord pointer
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoadVolatilePointer(
+    T                       *ptr,
+    Int2Type<false>         /*is_primitive*/)
+{
+    typedef typename UnitWord<T>::VolatileWord VolatileWord;   // Word type for memcopying
+
+    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);     // Number of VolatileWord reads issued for one T
+/* Disabled alternative (loads into a local word array, then reinterprets that array as T):
+    VolatileWord words[VOLATILE_MULTIPLE];
+
+    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
+        reinterpret_cast<volatile VolatileWord*>(ptr),
+        words);
+
+    return *reinterpret_cast<T*>(words);
+*/
+
+    T retval;
+    VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval);     // View the result object as an array of words so it is filled in place
+    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
+        reinterpret_cast<volatile VolatileWord*>(ptr),
+        words);
+    return retval;
+}
+
+
+/**
+ * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types: forwards to ThreadLoadVolatilePointer, selecting the overload by Traits<T>::PRIMITIVE
+ */
+template <typename T>
+__device__ __forceinline__ T ThreadLoad(
+    T                       *ptr,
+    Int2Type<LOAD_VOLATILE> /*modifier*/,
+    Int2Type<true>          /*is_pointer*/)
+{
+    // Apply tags for partial-specialization
+    return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
+}
+
+
+/**
+ * ThreadLoad definition for generic modifiers on pointer types: reads T as DEVICE_MULTIPLE DeviceWord-sized pieces, each loaded with ThreadLoad<MODIFIER>
+ */
+template <typename T, int MODIFIER>
+__device__ __forceinline__ T ThreadLoad(
+    T const                 *ptr,
+    Int2Type<MODIFIER>      /*modifier*/,
+    Int2Type<true>          /*is_pointer*/)
+{
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;            // Word type that T is read as
+
+    const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);     // Number of DeviceWord loads issued for one T
+
+    DeviceWord words[DEVICE_MULTIPLE];                              // Local staging buffer filled by the word loads below
+
+    IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>(
+        reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
+        words);
+
+    return *reinterpret_cast<T*>(words);                            // Reinterpret the staged words as a T
+}
+
+
+/**
+ * ThreadLoad definition for generic modifiers: public entry point that dispatches to the tagged overloads on (MODIFIER, IsPointer<InputIteratorT>::VALUE)
+ */
+template <
+    CacheLoadModifier MODIFIER,
+    typename InputIteratorT>
+__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
+{
+    // Apply tags for partial-specialization
+    return ThreadLoad(
+        itr,
+        Int2Type<MODIFIER>(),
+        Int2Type<IsPointer<InputIteratorT>::VALUE>());
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group UtilIo
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_operators.cuh b/external/cub/cub/thread/thread_operators.cuh
new file mode 100644
index 00000000000..2bd5403e864
--- /dev/null
+++ b/external/cub/cub/thread/thread_operators.cuh
@@ -0,0 +1,317 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Simple binary operator functor types
+ */
+
+/******************************************************************************
+ * Simple functor operators
+ ******************************************************************************/
+
+#pragma once
+
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+/**
+ * \brief Default equality functor (stateless; forwards to <tt>operator==</tt> of T; callable on host and device)
+ */
+struct Equality
+{
+    /// Boolean equality operator, returns <tt>(a == b)</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return a == b;
+    }
+};
+
+
+/**
+ * \brief Default inequality functor (stateless; forwards to <tt>operator!=</tt> of T; callable on host and device)
+ */
+struct Inequality
+{
+    /// Boolean inequality operator, returns <tt>(a != b)</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return a != b;
+    }
+};
+
+
+/**
+ * \brief Inequality functor (wraps equality functor and negates its result)
+ */
+template <typename EqualityOp>
+struct InequalityWrapper
+{
+    /// Wrapped equality operator
+    EqualityOp op;
+
+    /// Constructor: stores a copy of the equality functor to negate
+    __host__ __device__ __forceinline__
+    InequalityWrapper(EqualityOp op) : op(op) {}
+
+    /// Boolean inequality operator, returns <tt>!op(a, b)</tt> (not const-qualified, unlike cub::Inequality)
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b)
+    {
+        return !op(a, b);
+    }
+};
+
+
+/**
+ * \brief Default sum functor (stateless; forwards to <tt>operator+</tt> of T)
+ */
+struct Sum
+{
+    /// Binary sum operator, returns <tt>a + b</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return a + b;
+    }
+};
+
+
+/**
+ * \brief Default max functor (stateless; delegates to the CUB_MAX macro)
+ */
+struct Max
+{
+    /// Binary max operator, returns <tt>CUB_MAX(a, b)</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MAX(a, b);
+    }
+};
+
+
+/**
+ * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item)
+ */
+struct ArgMax
+{
+    /// Arg max operator: returns b when b.value is larger, or when the values are equal and b.key is smaller; otherwise returns a
+    template <typename T, typename OffsetT>
+    __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
+        const KeyValuePair<OffsetT, T> &a,
+        const KeyValuePair<OffsetT, T> &b) const
+    {
+// Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
+//        return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
+
+        if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key)))
+            return b;
+        return a;
+    }
+};
+
+
+/**
+ * \brief Default min functor (stateless; delegates to the CUB_MIN macro)
+ */
+struct Min
+{
+    /// Binary min operator, returns <tt>CUB_MIN(a, b)</tt>
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MIN(a, b);
+    }
+};
+
+
+/**
+ * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
+ */
+struct ArgMin
+{
+    /// Arg min operator: returns b when b.value is smaller, or when the values are equal and b.key is smaller; otherwise returns a
+    template <typename T, typename OffsetT>
+    __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
+        const KeyValuePair<OffsetT, T> &a,
+        const KeyValuePair<OffsetT, T> &b) const
+    {
+// Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
+//        return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
+
+        if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key)))
+            return b;
+        return a;
+    }
+};
+
+
+/**
+ * \brief Default cast functor (B is the destination type; the source type A is deduced per call)
+ */
+template <typename B>
+struct CastOp
+{
+    /// Cast operator, returns <tt>(B) a</tt>
+    template <typename A>
+    __host__ __device__ __forceinline__ B operator()(const A &a) const
+    {
+        return (B) a;
+    }
+};
+
+
+/**
+ * \brief Binary operator wrapper for switching non-commutative scan arguments: calling it with (a, b) invokes the wrapped operator with (b, a)
+ */
+template <typename ScanOp>
+class SwizzleScanOp
+{
+private:
+
+    /// Wrapped scan operator
+    ScanOp scan_op;
+
+public:
+
+    /// Constructor: stores a copy of the scan operator to wrap
+    __host__ __device__ __forceinline__
+    SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
+
+    /// Switch the scan arguments (not const-qualified)
+    template <typename T>
+    __host__ __device__ __forceinline__
+    T operator()(const T &a, const T &b)
+    {
+      T _a(a);      // local copies, so scan_op receives non-const T objects
+      T _b(b);
+
+      return scan_op(_b, _a);   // operands deliberately passed in swapped order
+    }
+};
+
+
+/**
+ * \brief Reduce-by-segment functor.
+ *
+ * Given two cub::KeyValuePair inputs \p a and \p b and a
+ * binary associative combining operator \p <tt>f(const T &x, const T &y)</tt>,
+ * an instance of this functor returns a cub::KeyValuePair whose \p key
+ * field is <tt>a.key</tt> + <tt>b.key</tt>, and whose \p value field
+ * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise.
+ *
+ * ReduceBySegmentOp is an associative, non-commutative binary combining operator
+ * for input sequences of cub::KeyValuePair pairings.  Such
+ * sequences are typically used to represent a segmented set of values to be reduced
+ * and a corresponding set of {0,1}-valued integer "head flags" demarcating the
+ * first value of each segment.
+ *
+ */
+template <typename ReductionOpT>    ///< Binary reduction operator to apply to values
+struct ReduceBySegmentOp
+{
+    /// Wrapped reduction operator
+    ReductionOpT op;
+
+    /// Default constructor (leaves \p op default-initialized)
+    __host__ __device__ __forceinline__ ReduceBySegmentOp() {}
+
+    /// Constructor: stores a copy of the reduction operator
+    __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {}
+
+    /// Scan operator (not const-qualified)
+    template <typename KeyValuePairT>       ///< KeyValuePair pairing of T (value) and OffsetT (head flag)
+    __host__ __device__ __forceinline__ KeyValuePairT operator()(
+        const KeyValuePairT &first,         ///< First partial reduction
+        const KeyValuePairT &second)        ///< Second partial reduction
+    {
+        KeyValuePairT retval;
+        retval.key = first.key + second.key;
+        retval.value = (second.key) ?
+                second.value :                          // The second partial reduction spans a segment reset, so its value aggregate becomes the running aggregate
+                op(first.value, second.value);          // The second partial reduction does not span a reset, so accumulate both into the running aggregate
+        return retval;
+    }
+};
+
+
+/** \brief Reduce-by-key functor: returns \p second, with its value replaced by <tt>op(first.value, second.value)</tt> when the two keys compare equal */
+template <typename ReductionOpT>    ///< Binary reduction operator to apply to values
+struct ReduceByKeyOp
+{
+    /// Wrapped reduction operator
+    ReductionOpT op;
+
+    /// Default constructor (leaves \p op default-initialized)
+    __host__ __device__ __forceinline__ ReduceByKeyOp() {}
+
+    /// Constructor: stores a copy of the reduction operator
+    __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {}
+
+    /// Scan operator (not const-qualified)
+    template <typename KeyValuePairT>
+    __host__ __device__ __forceinline__ KeyValuePairT operator()(
+        const KeyValuePairT &first,       ///< First partial reduction
+        const KeyValuePairT &second)      ///< Second partial reduction
+    {
+        KeyValuePairT retval = second;      // start from the second pair (its key is always kept)
+
+        if (first.key == second.key)
+            retval.value = op(first.value, retval.value);   // same key: fold the first value into the result
+
+        return retval;
+    }
+};
+
+
+
+
+
+
+
+/** @} */       // end group UtilModule
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_reduce.cuh b/external/cub/cub/thread/thread_reduce.cuh
new file mode 100644
index 00000000000..9e277050236
--- /dev/null
+++ b/external/cub/cub/thread/thread_reduce.cuh
@@ -0,0 +1,152 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential reduction over statically-sized array types
+ */
+
+#pragma once
+
+#include "../thread/thread_operators.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
+namespace internal {
+
+/**
+ * Tag-dispatched worker: folds \p LENGTH items of \p input into \p prefix, left to right, and returns the result
+ */
+template <int LENGTH, typename T, typename ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*                  input,                  ///< [in] Input array
+    ReductionOp         reduction_op,           ///< [in] Binary reduction operator
+    T                   prefix,                 ///< [in] Prefix to seed reduction with
+    Int2Type<LENGTH>    /*length*/)             ///< [in] Tag carrying the compile-time item count
+{
+    // The running aggregate starts as the seed and is always the left operand
+    T aggregate = prefix;
+
+    #pragma unroll
+    for (int item = 0; item < LENGTH; ++item)
+    {
+        aggregate = reduction_op(aggregate, input[item]);
+    }
+
+    return aggregate;
+}
+
+
+/**
+ * \brief Sequentially reduces the first \p LENGTH items of \p input, starting from \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH         Number of items to reduce (cannot be deduced from the arguments; specify it explicitly)
+ * \tparam T              <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp    <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <int LENGTH, typename T, typename ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*          input,                  ///< [in] Pointer to the first item to reduce
+    ReductionOp reduction_op,           ///< [in] Binary reduction operator
+    T           prefix)                 ///< [in] Prefix to seed reduction with
+{
+    // Forward to the tag-dispatched worker, encoding the item count in the tag type
+    typedef Int2Type<LENGTH> LengthTag;
+
+    return ThreadReduce(input, reduction_op, prefix, LengthTag());
+}
+
+
+/**
+ * \brief Sequentially reduces the first \p LENGTH items of \p input, using the first item as the seed.  The aggregate is returned.
+ *
+ * \tparam LENGTH         Number of items to reduce (cannot be deduced from the arguments; specify it explicitly)
+ * \tparam T              <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp    <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <int LENGTH, typename T, typename ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*          input,                  ///< [in] Pointer to the first item to reduce (input[0] is read unconditionally)
+    ReductionOp reduction_op)           ///< [in] Binary reduction operator
+{
+    // Peel off the leading item as the seed, then fold in the remaining LENGTH - 1 items
+    T  seed = input[0];
+    T* remaining = input + 1;
+
+    return ThreadReduce<LENGTH - 1>(remaining, reduction_op, seed);
+}
+
+
+/**
+ * \brief Sequentially reduces every item of the statically-sized \p input array, starting from \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH         <b>[inferred]</b> Number of items in the \p input array
+ * \tparam T              <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp    <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <int LENGTH, typename T, typename ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    ReductionOp reduction_op,           ///< [in] Binary reduction operator
+    T           prefix)                 ///< [in] Prefix to seed reduction with
+{
+    // The array decays to a pointer here; its extent travels in the dispatch tag instead
+    typedef Int2Type<LENGTH> LengthTag;
+
+    return ThreadReduce(input, reduction_op, prefix, LengthTag());
+}
+
+
+/**
+ * \brief Sequentially reduces every item of the statically-sized \p input array, using the first item as the seed.  The aggregate is returned.
+ *
+ * \tparam LENGTH         <b>[inferred]</b> Number of items in the \p input array
+ * \tparam T              <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp    <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <int LENGTH, typename T, typename ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    ReductionOp reduction_op)           ///< [in] Binary reduction operator
+{
+    // Decay to a pointer so that the explicit-length pointer overload is selected
+    T* first_item = input;
+
+    return ThreadReduce<LENGTH>(first_item, reduction_op);
+}
+
+
+}               // internal namespace
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_scan.cuh b/external/cub/cub/thread/thread_scan.cuh
new file mode 100644
index 00000000000..545b4141918
--- /dev/null
+++ b/external/cub/cub/thread/thread_scan.cuh
@@ -0,0 +1,268 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential prefix scan over statically-sized array types
+ */
+
+#pragma once
+
+#include "../thread/thread_operators.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
+namespace internal {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+/**
+ * \name Sequential prefix scan over statically-sized array types
+ * @{
+ */
+
+/// Tag-dispatched worker: continues an exclusive scan over \p LENGTH items given the running totals.
+/// Each input item is read before the matching output item is written, so \p output may alias \p input.
+/// Returns the running inclusive total after the last item (\p inclusive unchanged if the loop never runs).
+template <int LENGTH, typename T, typename ScanOp>
+__device__ __forceinline__ T ThreadScanExclusive(
+    T                   inclusive,              ///< [in] Running inclusive total on entry
+    T                   exclusive,              ///< [in] Running exclusive total (written to the first output item)
+    T                   *input,                 ///< [in] Input array
+    T                   *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp              scan_op,                ///< [in] Binary scan operator
+    Int2Type<LENGTH>    /*length*/)             ///< [in] Tag carrying the compile-time item count
+{
+    #pragma unroll
+    for (int item = 0; item < LENGTH; ++item)
+    {
+        inclusive = scan_op(exclusive, input[item]);    // Fold this item into the running total
+        output[item] = exclusive;                       // Emit the total that excludes this item
+        exclusive = inclusive;                          // Carry the total forward to the next item
+    }
+
+    return inclusive;
+}
+
+
+
+/**
+ * \brief Sequential exclusive prefix scan over the first \p LENGTH items of \p input, seeded with \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     Number of items in \p input and \p output (cannot be deduced from the arguments; specify it explicitly)
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <int LENGTH, typename T, typename ScanOp>
+__device__ __forceinline__ T ThreadScanExclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether \p prefix is folded into the scan.  output[0] is assigned \p prefix either way.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    // Running total through item 0; the seed is folded in only on request
+    T running = input[0];
+    if (apply_prefix)
+        running = scan_op(prefix, running);
+
+    // input[0] has already been consumed, so this write is safe even when the arrays alias
+    output[0] = prefix;
+
+    // Scan the remaining LENGTH - 1 items; both running totals start out equal
+    typedef Int2Type<LENGTH - 1> RemainingTag;
+
+    return ThreadScanExclusive(running, running, input + 1, output + 1, scan_op, RemainingTag());
+}
+
+
+/**
+ * \brief Sequential exclusive prefix scan over the whole statically-sized \p input array, seeded with \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> Number of items in the \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <int LENGTH, typename T, typename ScanOp>
+__device__ __forceinline__ T ThreadScanExclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    // Decay both arrays to pointers so that the explicit-length pointer overload is selected
+    T* in_items  = input;
+    T* out_items = output;
+    return ThreadScanExclusive<LENGTH>(in_items, out_items, scan_op, prefix, apply_prefix);
+}
+
+
+
+
+
+
+
+
+
+/// Tag-dispatched worker: continues an inclusive scan over \p LENGTH items from the running total \p inclusive.
+/// Each input item is read before the matching output item is written, so \p output may alias \p input.
+/// Returns the running total after the last item (\p inclusive unchanged if the loop never runs).
+template <int LENGTH, typename T, typename ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T                   inclusive,              ///< [in] Running inclusive total on entry
+    T                   *input,                 ///< [in] Input array
+    T                   *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp              scan_op,                ///< [in] Binary scan operator
+    Int2Type<LENGTH>    /*length*/)             ///< [in] Tag carrying the compile-time item count
+{
+    #pragma unroll
+    for (int item = 0; item < LENGTH; ++item)
+    {
+        inclusive = scan_op(inclusive, input[item]);    // Fold this item into the running total
+        output[item] = inclusive;                       // Emit the total that includes this item
+    }
+
+    return inclusive;
+}
+
+
+/**
+ * \brief Sequential inclusive prefix scan over the first \p LENGTH items of \p input.  The aggregate is returned.
+ *
+ * \tparam LENGTH     Number of items in \p input and \p output (cannot be deduced from the arguments; specify it explicitly)
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <int LENGTH, typename T, typename ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op)                ///< [in] Binary scan operator
+{
+    // Item 0 passes through unchanged and becomes the initial running total
+    T first = input[0];
+    output[0] = first;
+
+    // Scan the remaining LENGTH - 1 items
+    typedef Int2Type<LENGTH - 1> RemainingTag;
+
+    return ThreadScanInclusive(first, input + 1, output + 1, scan_op, RemainingTag());
+}
+
+
+/**
+ * \brief Sequential inclusive prefix scan over the whole statically-sized \p input array.  The aggregate is returned.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> Number of items in the \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <int LENGTH, typename T, typename ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op)                ///< [in] Binary scan operator
+{
+    // Decay both arrays to pointers so that the explicit-length pointer overload is selected
+    T* in_items  = input;
+    T* out_items = output;
+    return ThreadScanInclusive<LENGTH>(in_items, out_items, scan_op);
+}
+
+
+/**
+ * \brief Sequential inclusive prefix scan over the first \p LENGTH items of \p input, seeded with \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     Number of items in \p input and \p output (cannot be deduced from the arguments; specify it explicitly)
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <int LENGTH, typename T, typename ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           *input,                 ///< [in] Input array
+    T           *output,                ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    // Running total through item 0; the seed is folded in only on request
+    T running = input[0];
+    if (apply_prefix)
+        running = scan_op(prefix, running);
+
+    // Item 0 of an inclusive scan is the running total itself
+    output[0] = running;
+
+    // Scan the remaining LENGTH - 1 items
+    typedef Int2Type<LENGTH - 1> RemainingTag;
+
+    return ThreadScanInclusive(running, input + 1, output + 1, scan_op, RemainingTag());
+}
+
+
+/**
+ * \brief Sequential inclusive prefix scan over the whole statically-sized \p input array, seeded with \p prefix.  The aggregate is returned.
+ *
+ * \tparam LENGTH     <b>[inferred]</b> Number of items in the \p input and \p output arrays
+ * \tparam T          <b>[inferred]</b> The data type to be scanned.
+ * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ */
+template <int LENGTH, typename T, typename ScanOp>
+__device__ __forceinline__ T ThreadScanInclusive(
+    T           (&input)[LENGTH],       ///< [in] Input array
+    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
+    ScanOp      scan_op,                ///< [in] Binary scan operator
+    T           prefix,                 ///< [in] Prefix to seed scan with
+    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
+{
+    // Decay both arrays to pointers so that the explicit-length pointer overload is selected
+    T* in_items  = input;
+    T* out_items = output;
+    return ThreadScanInclusive<LENGTH>(in_items, out_items, scan_op, prefix, apply_prefix);
+}
+
+
+//@}  end member group
+
+/** @} */       // end group UtilModule
+
+
+}               // internal namespace
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_search.cuh b/external/cub/cub/thread/thread_search.cuh
new file mode 100644
index 00000000000..379a08a51e7
--- /dev/null
+++ b/external/cub/cub/thread/thread_search.cuh
@@ -0,0 +1,154 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential search
+ */
+
+#pragma once
+
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * Computes the begin offsets into A and B for the specific diagonal
+ *
+ * Binary-searches the A-offsets that are feasible for \p diagonal.  On return, path_coordinate.x holds
+ * the offset into \p a (clamped to \p a_len) and path_coordinate.y holds \p diagonal minus the split
+ * point, i.e. the offset into \p b.  Only members \c x and \c y of \p path_coordinate are written.
+ */
+template <typename AIteratorT, typename BIteratorT, typename OffsetT, typename CoordinateT>
+__host__ __device__ __forceinline__ void MergePathSearch(
+    OffsetT         diagonal,           ///< [in] Diagonal to search along
+    AIteratorT      a,                  ///< [in] Indexable sequence A (compared via <tt>a[i] <= b[j]</tt>)
+    BIteratorT      b,                  ///< [in] Indexable sequence B
+    OffsetT         a_len,              ///< [in] Number of items in A
+    OffsetT         b_len,              ///< [in] Number of items in B
+    CoordinateT&    path_coordinate)    ///< [out] Receives the split as members \c x (A) and \c y (B)
+{
+    // Candidate A-offsets lie in [max(diagonal - b_len, 0), min(diagonal, a_len)].
+    // The lower bound is compared before subtracting so that an unsigned OffsetT cannot wrap around.
+    OffsetT split_min = (diagonal > b_len) ? OffsetT(diagonal - b_len) : OffsetT(0);
+    OffsetT split_max = (diagonal < a_len) ? diagonal : a_len;
+
+    while (split_min < split_max)
+    {
+        // Midpoint computed without forming split_min + split_max, which could overflow OffsetT
+        OffsetT split_pivot = split_min + ((split_max - split_min) >> 1);
+        if (a[split_pivot] <= b[diagonal - split_pivot - 1])
+        {
+            // Move candidate split range up A, down B
+            split_min = split_pivot + 1;
+        }
+        else
+        {
+            // Move candidate split range up B, down A
+            split_max = split_pivot;
+        }
+    }
+
+    path_coordinate.x = (split_min < a_len) ? split_min : a_len;
+    path_coordinate.y = diagonal - split_min;
+}
+
+
+
+/**
+ * \brief Returns the offset of the first value within \p input which does not compare less than \p val
+ *        (\p num_items if every value compares less).  Binary search using only <tt>input[i] < val</tt>.
+ */
+template <typename InputIteratorT, typename OffsetT, typename T>
+__device__ __forceinline__ OffsetT LowerBound(
+    InputIteratorT      input,              ///< [in] Input sequence
+    OffsetT             num_items,          ///< [in] Input sequence length
+    T                   val)                ///< [in] Search key
+{
+    // The answer always lies in [first, first + num_items]
+    OffsetT first = 0;
+    while (num_items > 0)
+    {
+        OffsetT step = num_items >> 1;
+
+        if (!(input[first + step] < val))
+        {
+            // Answer is at or before the probe: keep only the items preceding it
+            num_items = step;
+            continue;
+        }
+
+        // Probe compares less than the key: discard it and everything before it
+        first     += step + 1;
+        num_items -= step + 1;
+    }
+    return first;
+}
+
+
+/**
+ * \brief Returns the offset of the first value within \p input which compares greater than \p val
+ *        (\p num_items if no value compares greater).  Binary search using only <tt>val < input[i]</tt>.
+ */
+template <typename InputIteratorT, typename OffsetT, typename T>
+__device__ __forceinline__ OffsetT UpperBound(
+    InputIteratorT      input,              ///< [in] Input sequence
+    OffsetT             num_items,          ///< [in] Input sequence length
+    T                   val)                ///< [in] Search key
+{
+    // The answer always lies in [first, first + num_items]
+    OffsetT first = 0;
+    while (num_items > 0)
+    {
+        OffsetT step = num_items >> 1;
+
+        if (val < input[first + step])
+        {
+            // Answer is at or before the probe: keep only the items preceding it
+            num_items = step;
+            continue;
+        }
+
+        // Probe does not compare greater than the key: discard it and everything before it
+        first     += step + 1;
+        num_items -= step + 1;
+    }
+    return first;
+}
+
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_store.cuh b/external/cub/cub/thread/thread_store.cuh
new file mode 100644
index 00000000000..14ee84d9270
--- /dev/null
+++ b/external/cub/cub/thread/thread_store.cuh
@@ -0,0 +1,422 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for writing memory using PTX cache modifiers.
+ */
+
+#pragma once
+
+#include <cuda.h>
+
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+
+//-----------------------------------------------------------------------------
+// Tags and constants
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief Enumeration of cache modifiers for memory store operations.  An enumerator is passed as the explicit \p MODIFIER template argument of cub::ThreadStore.
+ */
+enum CacheStoreModifier
+{
+    STORE_DEFAULT,              ///< Default (no modifier)
+    STORE_WB,                   ///< Cache write-back all coherent levels
+    STORE_CG,                   ///< Cache at global level
+    STORE_CS,                   ///< Cache streaming (likely to be accessed once)
+    STORE_WT,                   ///< Cache write-through (to system memory)
+    STORE_VOLATILE,             ///< Volatile store (any memory space)
+};
+
+
+/**
+ * \name Thread I/O (cache modified)
+ * @{
+ */
+
+/**
+ * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.  Can be used to store any data type.
+ *
+ * \par Example
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_store.cuh>
+ *
+ * // 32-bit store using cache-global modifier:
+ * int *d_out;
+ * int val;
+ * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
+ *
+ * // 16-bit store using default modifier
+ * short *d_out;
+ * short val;
+ * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
+ *
+ * // 256-bit store using write-through modifier
+ * double4 *d_out;
+ * double4 val;
+ * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
+ *
+ * // Store of a user-defined struct using cache-streaming cache modifier
+ * struct TestFoo { bool a; short b; };
+ * TestFoo *d_struct;
+ * TestFoo val;
+ * cub::ThreadStore<cub::STORE_CS>(d_struct + threadIdx.x, val);
+ * \endcode
+ *
+ * \tparam MODIFIER             CacheStoreModifier enumeration (specified explicitly, as in the examples above)
+ * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type \iterator
+ * \tparam T                    <b>[inferred]</b> Data type of output value
+ */
+template <
+    CacheStoreModifier  MODIFIER,
+    typename            OutputIteratorT,
+    typename            T>
+__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
+
+
+//@}  end member group
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/// Helper structure for templated store iteration (inductive case): handles item \p COUNT, then defers to IterateThreadStore<COUNT + 1, MAX>
+template <int COUNT, int MAX>
+struct IterateThreadStore
+{
+    template <CacheStoreModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Store(T *ptr, T *vals)
+    {
+        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);                            // Store item COUNT with the requested cache modifier
+        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);    // Continue with item COUNT + 1
+    }
+
+    template <typename OutputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
+    {
+        ptr[COUNT] = vals[COUNT];                                                   // Plain indexed assignment (no ThreadStore call)
+        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);                 // Continue with item COUNT + 1
+    }
+
+};
+
+/// Helper structure for templated store iteration (termination case): selected once the item index equals \p MAX, so both members are no-ops
+template <int MAX>
+struct IterateThreadStore<MAX, MAX>
+{
+    template <CacheStoreModifier MODIFIER, typename T>
+    static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}                          // Nothing left to store
+
+    template <typename OutputIteratorT, typename T>
+    static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}        // Nothing left to assign
+};
+
+
+/**
+ * Define uint4 and ulonglong2 (16B) ThreadStore specializations for the given cache store modifier; \p ptx_modifier is stringized into the PTX <tt>st</tt> instruction
+ */
+#define _CUB_STORE_16(cub_modifier, ptx_modifier)                                            \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val)                         \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :               \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val.x),                                                                     \
+            "r"(val.y),                                                                     \
+            "r"(val.z),                                                                     \
+            "r"(val.w));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val)     \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :                       \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "l"(val.x),                                                                     \
+            "l"(val.y));                                                                    \
+    }
+
+
+/**
+ * Define ushort4, uint2 and unsigned long long (8B) ThreadStore specializations for the given cache store modifier; \p ptx_modifier is stringized into the PTX <tt>st</tt> instruction
+ */
+#define _CUB_STORE_8(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val)                 \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :               \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"(val.x),                                                                     \
+            "h"(val.y),                                                                     \
+            "h"(val.z),                                                                     \
+            "h"(val.w));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val)                         \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :                       \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val.x),                                                                     \
+            "r"(val.y));                                                                    \
+    }                                                                                       \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val)     \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "l"(val));                                                                      \
+    }
+
+/**
+ * Define the unsigned int (4B) ThreadStore specialization for the given cache store modifier.
+ *
+ * cub_modifier is the modifier value the specialization is keyed on;
+ * ptx_modifier is stringized into the emitted "st.<ptx_modifier>.u32" instruction.
+ */
+#define _CUB_STORE_4(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val)                             \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "r"(val));                                                                      \
+    }
+
+
+/**
+ * Define the unsigned short (2B) ThreadStore specialization for the given cache store modifier.
+ *
+ * cub_modifier is the modifier value the specialization is keyed on;
+ * ptx_modifier is stringized into the emitted "st.<ptx_modifier>.u16" instruction.
+ */
+#define _CUB_STORE_2(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val)                     \
+    {                                                                                       \
+        asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : :                                \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"(val));                                                                      \
+    }
+
+
+/**
+ * Define the unsigned char (1B) ThreadStore specialization for the given cache store modifier.
+ *
+ * The value is widened to unsigned short and passed through a 16-bit ("h")
+ * operand, then narrowed inside the asm block with cvt.u8.u16 into a local
+ * .u8 register before the "st.<ptx_modifier>.u8" instruction is issued.
+ */
+#define _CUB_STORE_1(cub_modifier, ptx_modifier)                                             \
+    template<>                                                                              \
+    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val)                         \
+    {                                                                                       \
+        asm volatile (                                                                      \
+        "{"                                                                                 \
+        "   .reg .u8 datum;"                                                                \
+        "   cvt.u8.u16 datum, %1;"                                                          \
+        "   st."#ptx_modifier".u8 [%0], datum;"                                             \
+        "}" : :                                                                             \
+            _CUB_ASM_PTR_(ptr),                                                             \
+            "h"((unsigned short) val));                                                               \
+    }
+
+/**
+ * Define the power-of-two sized (16B, 8B, 4B, 2B, 1B) ThreadStore
+ * specializations for the given cache store modifier by expanding each of
+ * the per-size helper macros with the same arguments.
+ */
+#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)                                           \
+    _CUB_STORE_16(cub_modifier, ptx_modifier)                                                \
+    _CUB_STORE_8(cub_modifier, ptx_modifier)                                                 \
+    _CUB_STORE_4(cub_modifier, ptx_modifier)                                                 \
+    _CUB_STORE_2(cub_modifier, ptx_modifier)                                                 \
+    _CUB_STORE_1(cub_modifier, ptx_modifier)                                                 \
+
+
+/**
+ * Define ThreadStore specializations for the various cache store modifiers
+ */
+#if CUB_PTX_ARCH >= 200
+    // Each CUB modifier is emitted with the identically named PTX store qualifier
+    _CUB_STORE_ALL(STORE_WB, wb)
+    _CUB_STORE_ALL(STORE_CG, cg)
+    _CUB_STORE_ALL(STORE_CS, cs)
+    _CUB_STORE_ALL(STORE_WT, wt)
+#else
+    // Below PTX arch 200 every modifier is emitted as a plain global store,
+    // except STORE_WT which is emitted as a volatile global store
+    _CUB_STORE_ALL(STORE_WB, global)
+    _CUB_STORE_ALL(STORE_CG, global)
+    _CUB_STORE_ALL(STORE_CS, global)
+    _CUB_STORE_ALL(STORE_WT, volatile.global)
+#endif
+
+
+// Macro cleanup: the helper macros are only needed to generate the specializations above
+#undef _CUB_STORE_ALL
+#undef _CUB_STORE_1
+#undef _CUB_STORE_2
+#undef _CUB_STORE_4
+#undef _CUB_STORE_8
+#undef _CUB_STORE_16
+
+
+/**
+ * ThreadStore overload for the STORE_DEFAULT modifier when the destination
+ * is an output iterator that is not a raw pointer: the value is written by
+ * plain assignment through the dereferenced iterator.
+ */
+template <typename OutputIteratorT, typename T>
+__device__ __forceinline__ void ThreadStore(
+    OutputIteratorT itr,
+    T val,
+    Int2Type<STORE_DEFAULT> /*modifier*/,
+    Int2Type<false> /*is_pointer*/)
+{
+    (*itr) = val;
+}
+
+
+/**
+ * ThreadStore overload for the STORE_DEFAULT modifier when the destination
+ * is a raw pointer: the value is written by plain assignment through the pointer.
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStore(
+    T *ptr,
+    T val,
+    Int2Type<STORE_DEFAULT> /*modifier*/,
+    Int2Type<true> /*is_pointer*/)
+{
+    (*ptr) = val;
+}
+
+
+/**
+ * STORE_VOLATILE helper for primitive types: the destination is viewed as a
+ * pointer-to-volatile T and the value is assigned through it directly.
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStoreVolatilePtr(
+    T *ptr,
+    T val,
+    Int2Type<true> /*is_primitive*/)
+{
+    volatile T *volatile_dst = reinterpret_cast<volatile T*>(ptr);
+    *volatile_dst = val;
+}
+
+
+/**
+ * STORE_VOLATILE helper for non-primitive types: the value is first staged
+ * in a buffer of volatile-words (copied shuffle-word by shuffle-word), and
+ * the buffer is then written to the destination through IterateThreadStore.
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStoreVolatilePtr(
+    T *ptr,
+    T val,
+    Int2Type<false> /*is_primitive*/)
+{
+    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
+    typedef typename UnitWord<T>::VolatileWord  VolatileWord;
+
+    // Number of words of each kind needed to cover one T
+    const int NUM_SHUFFLE_WORDS  = sizeof(T) / sizeof(ShuffleWord);
+    const int NUM_VOLATILE_WORDS = sizeof(T) / sizeof(VolatileWord);
+
+    // Staging buffer, filled one shuffle-word at a time
+    VolatileWord staged[NUM_VOLATILE_WORDS];
+    ShuffleWord *staged_as_shuffle = reinterpret_cast<ShuffleWord*>(staged);
+    ShuffleWord *val_as_shuffle    = reinterpret_cast<ShuffleWord*>(&val);
+
+    #pragma unroll
+    for (int word = 0; word < NUM_SHUFFLE_WORDS; ++word)
+        staged_as_shuffle[word] = val_as_shuffle[word];
+
+    // Write the staged volatile-words through a volatile view of the destination
+    IterateThreadStore<0, NUM_VOLATILE_WORDS>::template Dereference(
+        reinterpret_cast<volatile VolatileWord*>(ptr),
+        staged);
+}
+
+
+/**
+ * ThreadStore overload for the STORE_VOLATILE modifier on pointer types.
+ * Dispatches to the ThreadStoreVolatilePtr helper that matches whether T is
+ * reported as primitive by Traits<T>.
+ */
+template <typename T>
+__device__ __forceinline__ void ThreadStore(
+    T *ptr,
+    T val,
+    Int2Type<STORE_VOLATILE> /*modifier*/,
+    Int2Type<true> /*is_pointer*/)
+{
+    typedef Int2Type<Traits<T>::PRIMITIVE> IsPrimitiveTag;
+
+    ThreadStoreVolatilePtr(ptr, val, IsPrimitiveTag());
+}
+
+
+/**
+ * ThreadStore overload for generic modifiers on pointer types: the value is
+ * first staged in a buffer of device-words (copied shuffle-word by
+ * shuffle-word), and the buffer is then stored through IterateThreadStore
+ * using the requested modifier.
+ */
+template <typename T, int MODIFIER>
+__device__ __forceinline__ void ThreadStore(
+    T *ptr,
+    T val,
+    Int2Type<MODIFIER> /*modifier*/,
+    Int2Type<true> /*is_pointer*/)
+{
+    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
+    typedef typename UnitWord<T>::DeviceWord    DeviceWord;
+
+    // Number of words of each kind needed to cover one T
+    const int NUM_SHUFFLE_WORDS = sizeof(T) / sizeof(ShuffleWord);
+    const int NUM_DEVICE_WORDS  = sizeof(T) / sizeof(DeviceWord);
+
+    // Staging buffer, filled one shuffle-word at a time
+    DeviceWord staged[NUM_DEVICE_WORDS];
+    ShuffleWord *staged_as_shuffle = reinterpret_cast<ShuffleWord*>(staged);
+    ShuffleWord *val_as_shuffle    = reinterpret_cast<ShuffleWord*>(&val);
+
+    #pragma unroll
+    for (int word = 0; word < NUM_SHUFFLE_WORDS; ++word)
+        staged_as_shuffle[word] = val_as_shuffle[word];
+
+    // Store the staged device-words with the requested cache modifier
+    IterateThreadStore<0, NUM_DEVICE_WORDS>::template Store<CacheStoreModifier(MODIFIER)>(
+        reinterpret_cast<DeviceWord*>(ptr),
+        staged);
+}
+
+
+/**
+ * ThreadStore entry point for generic modifiers.  Forwards to the tagged
+ * overloads, selecting on the modifier and on whether the output iterator
+ * is a raw pointer.
+ */
+template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
+__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
+{
+    typedef Int2Type<MODIFIER>                              ModifierTag;
+    typedef Int2Type<IsPointer<OutputIteratorT>::VALUE>     IsPointerTag;
+
+    ThreadStore(itr, val, ModifierTag(), IsPointerTag());
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group UtilIo
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_allocator.cuh b/external/cub/cub/util_allocator.cuh
new file mode 100644
index 00000000000..24c7a79fee5
--- /dev/null
+++ b/external/cub/cub/util_allocator.cuh
@@ -0,0 +1,708 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple caching allocator for device memory allocations. The allocator is
+ * thread-safe and capable of managing device allocations on multiple devices.
+ ******************************************************************************/
+
+#pragma once
+
+#include "util_namespace.cuh"
+#include "util_debug.cuh"
+
+#include <set>
+#include <map>
+
+#include "host/mutex.cuh"
+#include <math.h>
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+
+/******************************************************************************
+ * CachingDeviceAllocator (host use)
+ ******************************************************************************/
+
+/**
+ * \brief A simple caching allocator for device memory allocations.
+ *
+ * \par Overview
+ * The allocator is thread-safe and stream-safe and is capable of managing cached
+ * device allocations on multiple devices.  It behaves as follows:
+ *
+ * \par
+ * - Allocations from the allocator are associated with an \p active_stream.  Once freed,
+ *   the allocation becomes available immediately for reuse within the \p active_stream
+ *   with which it was associated during allocation, and it becomes available for
+ *   reuse within other streams when all prior work submitted to \p active_stream has completed.
+ * - Allocations are categorized and cached by bin size.  A new allocation request of
+ *   a given size will only consider cached allocations within the corresponding bin.
+ * - Bin limits progress geometrically in accordance with the growth factor
+ *   \p bin_growth provided during construction.  Unused device allocations within
+ *   a larger bin cache are not reused for allocation requests that categorize to
+ *   smaller bin sizes.
+ * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
+ *   (\p bin_growth ^ \p min_bin).
+ * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
+ *   bin and are simply freed when they are deallocated instead of being returned
+ *   to a bin-cache.
+ * - %If the total storage of cached allocations on a given device will exceed
+ *   \p max_cached_bytes, allocations for that device are simply freed when they are
+ *   deallocated instead of being returned to their bin-cache.
+ *
+ * \par
+ * For example, the default-constructed CachingDeviceAllocator is configured with:
+ * - \p bin_growth          = 8
+ * - \p min_bin             = 3
+ * - \p max_bin             = 7
+ * - \p max_cached_bytes    = 6MB - 1B
+ *
+ * \par
+ * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
+ * and sets a maximum of 6,291,455 cached bytes per device
+ *
+ */
+struct CachingDeviceAllocator
+{
+
+    //---------------------------------------------------------------------
+    // Constants
+    //---------------------------------------------------------------------
+
+    /// Out-of-bounds bin
+    static const unsigned int INVALID_BIN = (unsigned int) -1;
+
+    /// Invalid size
+    static const size_t INVALID_SIZE = (size_t) -1;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Invalid device ordinal
+    static const int INVALID_DEVICE_ORDINAL = -1;
+
+    //---------------------------------------------------------------------
+    // Type definitions and helper types
+    //---------------------------------------------------------------------
+
+    /**
+     * Bookkeeping record for a single device memory allocation
+     */
+    struct BlockDescriptor
+    {
+        void*           d_ptr;              // Device pointer
+        size_t          bytes;              // Size of allocation in bytes
+        unsigned int    bin;                // Bin enumeration
+        int             device;             // Device ordinal
+        cudaStream_t    associated_stream;  // Stream associated with the allocation
+        cudaEvent_t     ready_event;        // Signal when associated stream has run to the point at which this block was freed
+
+        // Search-key constructor: identifies a specific block by its pointer and device
+        BlockDescriptor(void *d_ptr, int device)
+            : d_ptr(d_ptr), bytes(0), bin(INVALID_BIN), device(device), associated_stream(0), ready_event(0)
+        {}
+
+        // Search-key constructor: identifies the range of blocks belonging to a device
+        BlockDescriptor(int device)
+            : d_ptr(NULL), bytes(0), bin(INVALID_BIN), device(device), associated_stream(0), ready_event(0)
+        {}
+
+        // Strict ordering by device ordinal first, then by device pointer
+        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            return (a.device != b.device) ?
+                (a.device < b.device) :
+                (a.d_ptr < b.d_ptr);
+        }
+
+        // Strict ordering by device ordinal first, then by allocation size
+        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            return (a.device != b.device) ?
+                (a.device < b.device) :
+                (a.bytes < b.bytes);
+        }
+    };
+
+    /// BlockDescriptor comparator function interface
+    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+
+    /// Pair of zero-initialized byte counters (free and live)
+    class TotalBytes {
+    public:
+        size_t free;    // Running total of free bytes
+        size_t live;    // Running total of live bytes
+
+        TotalBytes() : free(0), live(0) {}
+    };
+
+    /// Set type for cached blocks (ordered by size)
+    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+
+    /// Set type for live blocks (ordered by ptr)
+    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
+    /// Map type of device ordinals to the number of cached bytes cached by each device
+    typedef std::map<int, TotalBytes> GpuCachedBytes;
+
+
+    //---------------------------------------------------------------------
+    // Utility functions
+    //---------------------------------------------------------------------
+
+    /**
+     * Integer power (\p base raised to \p exp) computed by repeated squaring.
+     * All arithmetic is performed in unsigned int, so results that do not fit wrap around.
+     */
+    static unsigned int IntPow(
+        unsigned int base,
+        unsigned int exp)
+    {
+        unsigned int result = 1;
+
+        // Consume the exponent one bit at a time, least-significant bit first
+        for (; exp != 0; exp >>= 1)
+        {
+            if ((exp & 1u) != 0)
+                result *= base;     // this bit contributes the current power of base
+
+            base *= base;           // advance to the next power-of-two exponent
+        }
+
+        return result;
+    }
+
+
+    /**
+     * Round \p value up to the nearest power of \p base.
+     *
+     * On return, \p rounded_bytes is the smallest power of \p base that is
+     * greater than or equal to \p value, and \p power is the corresponding exponent.
+     *
+     * If that power cannot be reached without overflowing \p size_t (or \p base is
+     * less than 2 while \p value exceeds 1, so no power of \p base can reach it),
+     * \p power is set to the bit-width of \p size_t and \p rounded_bytes to the
+     * largest \p size_t value.
+     */
+    void NearestPowerOf(
+        unsigned int    &power,
+        size_t          &rounded_bytes,
+        unsigned int    base,
+        size_t          value)
+    {
+        power = 0;
+        rounded_bytes = 1;
+
+        // Values of 0 or 1 are already covered by base^0
+        if (value <= 1)
+            return;
+
+        const size_t max_value = size_t(0) - 1;
+
+        // Detect overflow with a division rather than by inspecting a possibly
+        // wrapped product: a wrapped (value * base) can still compare greater
+        // than value, which would let the loop below wrap rounded_bytes to zero
+        // and spin forever.  When value <= max_value / base, every intermediate
+        // rounded_bytes * base in the loop is guaranteed to fit in size_t.
+        if ((base < 2) || (value > max_value / base))
+        {
+            // Overflow
+            power = sizeof(size_t) * 8;
+            rounded_bytes = max_value;
+            return;
+        }
+
+        while (rounded_bytes < value)
+        {
+            rounded_bytes *= base;
+            power++;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Fields
+    //---------------------------------------------------------------------
+
+    cub::Mutex      mutex;              /// Mutex for thread-safety
+
+    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
+    unsigned int    min_bin;            /// Minimum bin enumeration
+    unsigned int    max_bin;            /// Maximum bin enumeration
+
+    size_t          min_bin_bytes;      /// Minimum bin size
+    size_t          max_bin_bytes;      /// Maximum bin size
+    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device
+
+    const bool      skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
+    bool            debug;              /// Whether or not to print (de)allocation events to stdout
+
+    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
+    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
+    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    //---------------------------------------------------------------------
+    // Methods
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Constructor.
+     *
+     * The minimum and maximum bin sizes are derived from \p bin_growth via
+     * IntPow(bin_growth, min_bin) and IntPow(bin_growth, max_bin).  The cached
+     * set is ordered with BlockDescriptor::SizeCompare and the live set with
+     * BlockDescriptor::PtrCompare.
+     *
+     * NOTE(review): IntPow computes in unsigned int, so with the default
+     * \p max_bin (INVALID_BIN) the derived max_bin_bytes is a wrapped value
+     * rather than a meaningful size -- confirm nothing relies on it in that case.
+     */
+    CachingDeviceAllocator(
+        unsigned int    bin_growth,                             ///< Geometric growth factor for bin-sizes
+        unsigned int    min_bin             = 1,                ///< Minimum bin (default is bin_growth ^ 1)
+        unsigned int    max_bin             = INVALID_BIN,      ///< Maximum bin (default is no max bin)
+        size_t          max_cached_bytes    = INVALID_SIZE,     ///< Maximum aggregate cached bytes per device (default is no limit)
+        bool            skip_cleanup        = false,            ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
+        bool            debug               = false)            ///< Whether or not to print (de)allocation events to stdout (default is no output)
+    :
+        bin_growth(bin_growth),
+        min_bin(min_bin),
+        max_bin(max_bin),
+        min_bin_bytes(IntPow(bin_growth, min_bin)),
+        max_bin_bytes(IntPow(bin_growth, max_bin)),
+        max_cached_bytes(max_cached_bytes),
+        skip_cleanup(skip_cleanup),
+        debug(debug),
+        cached_blocks(BlockDescriptor::SizeCompare),
+        live_blocks(BlockDescriptor::PtrCompare)
+    {}
+
+
+    /**
+     * \brief Default constructor.
+     *
+     * Configured with:
+     * \par
+     * - \p bin_growth          = 8
+     * - \p min_bin             = 3
+     * - \p max_bin             = 7
+     * - \p max_cached_bytes    = ((\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
+     *
+     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
+     * sets a maximum of 6,291,455 cached bytes per device
+     *
+     * The initializers for min_bin_bytes, max_bin_bytes and max_cached_bytes read
+     * members initialized earlier in the list (bin_growth, min_bin, max_bin and
+     * max_bin_bytes respectively).
+     */
+    CachingDeviceAllocator(
+        bool skip_cleanup = false,
+        bool debug = false)
+    :
+        bin_growth(8),
+        min_bin(3),
+        max_bin(7),
+        min_bin_bytes(IntPow(bin_growth, min_bin)),
+        max_bin_bytes(IntPow(bin_growth, max_bin)),
+        max_cached_bytes((max_bin_bytes * 3) - 1),
+        skip_cleanup(skip_cleanup),
+        debug(debug),
+        cached_blocks(BlockDescriptor::SizeCompare),
+        live_blocks(BlockDescriptor::PtrCompare)
+    {}
+
+
+    /**
+     * \brief Sets the limit on the number of bytes this allocator is allowed to cache per device.
+     *
+     * Only the ceiling is updated: no allocation (in-use or cached-in-reserve)
+     * is freed by this call.  See \p FreeAllCached().
+     *
+     * \return Always \p cudaSuccess.
+     */
+    cudaError_t SetMaxCachedBytes(
+        size_t max_cached_bytes)
+    {
+        mutex.Lock();
+
+        // Remember the outgoing limit so it can be reported after the update
+        const size_t previous_limit = this->max_cached_bytes;
+        this->max_cached_bytes = max_cached_bytes;
+
+        if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) previous_limit, (long long) max_cached_bytes);
+
+        mutex.Unlock();
+
+        return cudaSuccess;
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     *
+     * \par
+     * The request is rounded up to a bin size and matched against the cached blocks of
+     * that bin on \p device.  If no cached block is usable, a new block is obtained with
+     * \p cudaMalloc; if that fails with \p cudaErrorMemoryAllocation, all cached blocks on
+     * \p device are freed and the allocation is retried once.  Passing
+     * \p INVALID_DEVICE_ORDINAL as \p device selects the current device.
+     *
+     * \return \p cudaSuccess on success; otherwise the CUDA error that aborted the
+     *         request, in which case \p *d_ptr is left as \p NULL.
+     */
+    cudaError_t DeviceAllocate(
+        int             device,             ///< [in] Device on which to place the allocation
+        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
+        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
+        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
+    {
+        *d_ptr                          = NULL;
+        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
+        cudaError_t error               = cudaSuccess;
+
+        if (device == INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
+            device = entrypoint_device;
+        }
+
+        // Create a block descriptor for the requested allocation
+        bool found = false;
+        BlockDescriptor search_key(device);
+        search_key.associated_stream = active_stream;
+        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
+
+        if (search_key.bin > max_bin)
+        {
+            // Bin is greater than our maximum bin: allocate the request
+            // exactly and give out-of-bounds bin.  It will not be cached
+            // for reuse when returned.
+            search_key.bin      = INVALID_BIN;
+            search_key.bytes    = bytes;
+        }
+        else
+        {
+            // Search for a suitable cached allocation: lock
+            mutex.Lock();
+
+            if (search_key.bin < min_bin)
+            {
+                // Bin is less than minimum bin: round up
+                search_key.bin      = min_bin;
+                search_key.bytes    = min_bin_bytes;
+            }
+
+            // Iterate through the range of cached blocks on the same device in the same bin
+            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
+            while ((block_itr != cached_blocks.end())
+                    && (block_itr->device == device)
+                    && (block_itr->bin == search_key.bin))
+            {
+                // To prevent races with reusing blocks returned by the host but still
+                // in use by the device, only consider cached blocks that are
+                // either (from the active stream) or (from an idle stream)
+                if ((active_stream == block_itr->associated_stream) ||
+                    (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
+                {
+                    // Reuse existing cache block.  Insert into live blocks.
+                    found = true;
+                    search_key = *block_itr;
+                    search_key.associated_stream = active_stream;
+                    live_blocks.insert(search_key);
+
+                    // Remove from free blocks
+                    cached_bytes[device].free -= search_key.bytes;
+                    cached_bytes[device].live += search_key.bytes;
+
+                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
+                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long)  block_itr->associated_stream);
+
+                    // The iterator is not used again after this erase
+                    cached_blocks.erase(block_itr);
+
+                    break;
+                }
+                block_itr++;
+            }
+
+            // Done searching: unlock
+            mutex.Unlock();
+        }
+
+        // Allocate the block if necessary
+        if (!found)
+        {
+            // Set runtime's current device to specified device (entrypoint may not be set)
+            if (device != entrypoint_device)
+            {
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
+                if (CubDebug(error = cudaSetDevice(device))) return error;
+            }
+
+            // Attempt to allocate
+            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
+            {
+                // The allocation attempt failed: free all cached blocks on device and retry
+                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
+                      device, (long long) search_key.bytes, (long long) search_key.associated_stream);
+
+                error = cudaSuccess;    // Reset the error we will return
+                cudaGetLastError();     // Reset CUDART's error
+
+                // Lock
+                mutex.Lock();
+
+                // Iterate the range of free blocks on the same device
+                BlockDescriptor free_key(device);
+                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);
+
+                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
+                {
+                    // No need to worry about synchronization with the device: cudaFree is
+                    // blocking and will synchronize across all kernels executing
+                    // on the current device
+
+                    // Free device memory and destroy stream event.
+                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
+                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;
+
+                    // Reduce balance and erase entry
+                    cached_bytes[device].free -= block_itr->bytes;
+
+                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                        device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+
+                    // Erasing invalidates the erased iterator, so advance to the
+                    // next element (post-increment) before the node is removed
+                    cached_blocks.erase(block_itr++);
+                }
+
+                // Unlock
+                mutex.Unlock();
+
+                // Return under error
+                if (error) return error;
+
+                // Try to allocate again
+                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
+            }
+
+            // Any other cudaMalloc failure is returned to the caller here; otherwise it
+            // would be overwritten by the event-creation status below and a block
+            // without device storage would be recorded as live
+            if (error) return error;
+
+            // Create ready event
+            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
+                return error;
+
+            // Insert into live blocks
+            mutex.Lock();
+            live_blocks.insert(search_key);
+            cached_bytes[device].live += search_key.bytes;
+            mutex.Unlock();
+
+            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
+                      device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
+
+            // Attempt to revert back to previous device if necessary
+            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
+            {
+                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+            }
+        }
+
+        // Copy device pointer to output parameter
+        *d_ptr = search_key.d_ptr;
+
+        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
+            (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+
+        return error;
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the current device.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     *
+     * Forwards to the device-specific overload with \p INVALID_DEVICE_ORDINAL as the device.
+     */
+    cudaError_t DeviceAllocate(
+        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
+        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
+        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
+    {
+        const int unspecified_device = INVALID_DEVICE_ORDINAL;
+
+        cudaError_t status = DeviceAllocate(unspecified_device, d_ptr, bytes, active_stream);
+        return status;
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     */
+    cudaError_t DeviceFree(
+        int             device,             ///< [in] Device ordinal owning the allocation (INVALID_DEVICE_ORDINAL selects the current device)
+        void*           d_ptr)              ///< [in] Device pointer being returned to the allocator
+    {
+        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
+        cudaError_t error               = cudaSuccess;
+
+        if (device == INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
+                return error;
+            device = entrypoint_device;
+        }
+
+        // Lock
+        mutex.Lock();
+
+        // Find corresponding block descriptor
+        bool recached = false;
+        BlockDescriptor search_key(d_ptr, device);
+        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
+        if (block_itr != live_blocks.end())
+        {
+            // Remove from live blocks
+            search_key = *block_itr;
+            live_blocks.erase(block_itr);
+            cached_bytes[device].live -= search_key.bytes;
+
+            // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
+            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
+            {
+                // Insert returned allocation into free blocks
+                recached = true;
+                cached_blocks.insert(search_key);
+                cached_bytes[device].free += search_key.bytes;
+
+                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
+                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
+                    (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+            }
+        }
+
+        // First set to specified device (entrypoint may not be set).  The lock stays held until the ready
+        // event is recorded: a recached block is already visible in cached_blocks, and unlocking first would
+        // let another thread take it before ready_event reflects this free.
+        if (device != entrypoint_device)
+        {
+            if (!CubDebug(error = cudaGetDevice(&entrypoint_device)))
+                error = CubDebug(cudaSetDevice(device));
+        }
+        // Insert the ready event in the associated stream (must have current device set properly)
+        if (recached && (error == cudaSuccess))
+            error = CubDebug(cudaEventRecord(search_key.ready_event, search_key.associated_stream));
+
+        // Unlock (reached on every path: nothing above returns while the mutex is held)
+        mutex.Unlock();
+
+        if (!recached && (error == cudaSuccess))
+        {
+            // Free the allocation from the runtime and cleanup the event.
+            if (!CubDebug(error = cudaFree(d_ptr)))
+                error = CubDebug(cudaEventDestroy(search_key.ready_event));
+            if (debug && (error == cudaSuccess)) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
+        }
+
+        // Reset device (also after a failure) without masking an earlier error
+        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
+        {
+            cudaError_t reset_error = CubDebug(cudaSetDevice(entrypoint_device));
+            if (error == cudaSuccess) error = reset_error;
+        }
+
+        return error;
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
+     *
+     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
+     * with which it was associated during allocation, and it becomes available for reuse within other
+     * streams when all prior work submitted to \p active_stream has completed.
+     */
+    cudaError_t DeviceFree(
+        void*           d_ptr)              ///< [in] Device pointer being returned to the allocator
+    {
+        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);   // INVALID_DEVICE_ORDINAL makes the overload query the current device
+    }
+
+
+    /**
+     * \brief Frees all cached device allocations on all devices.  Returns the first CUDA error encountered, or cudaSuccess.
+     */
+    cudaError_t FreeAllCached()
+    {
+        cudaError_t error         = cudaSuccess;
+        int entrypoint_device     = INVALID_DEVICE_ORDINAL;
+        int current_device        = INVALID_DEVICE_ORDINAL;
+
+        mutex.Lock();
+
+        while (!cached_blocks.empty())
+        {
+            // Get first block
+            CachedBlocks::iterator begin = cached_blocks.begin();
+
+            // Get entry-point device ordinal if necessary
+            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
+            {
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+            }
+
+            // Set current device ordinal if necessary
+            if (begin->device != current_device)
+            {
+                if (CubDebug(error = cudaSetDevice(begin->device))) break;
+                current_device = begin->device;
+            }
+
+            // Free device memory
+            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
+            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;
+
+            // Reduce balance and erase entry
+            cached_bytes[current_device].free -= begin->bytes;
+
+            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
+                current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);
+
+            cached_blocks.erase(begin);
+        }
+
+        mutex.Unlock();
+
+        // Attempt to revert back to entry-point device if necessary
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            cudaError_t revert_error = CubDebug(cudaSetDevice(entrypoint_device));
+            if (error == cudaSuccess) error = revert_error;   // never overwrite a failure that broke out of the loop above
+        }
+        return error;
+    }
+
+
+    /**
+     * \brief Destructor.  Releases every cached allocation through FreeAllCached() unless \p skip_cleanup is set.
+     */
+    virtual ~CachingDeviceAllocator()
+    {
+        if (!skip_cleanup)
+            FreeAllCached();    // NOTE(review): the returned cudaError_t is dropped here
+    }
+
+};
+
+
+
+
+/** @} */       // end group UtilMgmt
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_arch.cuh b/external/cub/cub/util_arch.cuh
new file mode 100644
index 00000000000..5ec36e5f1f7
--- /dev/null
+++ b/external/cub/cub/util_arch.cuh
@@ -0,0 +1,151 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Static architectural properties by SM version.
+ */
+
+#pragma once
+
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS)    // defined automatically for CUDA 9+ compilers unless the user already defined it
+    #define CUB_USE_COOPERATIVE_GROUPS
+#endif
+
+/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).  May be pre-defined by the user.
+#ifndef CUB_PTX_ARCH
+    #ifndef __CUDA_ARCH__
+        #define CUB_PTX_ARCH 0
+    #else
+        #define CUB_PTX_ARCH __CUDA_ARCH__
+    #endif
+#endif
+
+
+/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API.  True on the host pass, and on device passes for arch >= 350 compiled with relocatable device code; CUB_RUNTIME_ENABLED is defined in exactly those cases.
+#ifndef CUB_RUNTIME_FUNCTION
+    #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
+        #define CUB_RUNTIME_ENABLED
+        #define CUB_RUNTIME_FUNCTION __host__ __device__
+    #else
+        #define CUB_RUNTIME_FUNCTION __host__
+    #endif
+#endif
+
+
+/// Number of threads per warp (log2 and absolute).  The \p arch argument is accepted but ignored: the value is 32 for every architecture.
+#ifndef CUB_LOG_WARP_THREADS
+    #define CUB_LOG_WARP_THREADS(arch)                      \
+        (5)
+    #define CUB_WARP_THREADS(arch)                          \
+        (1 << CUB_LOG_WARP_THREADS(arch))
+
+    #define CUB_PTX_WARP_THREADS        CUB_WARP_THREADS(CUB_PTX_ARCH)
+    #define CUB_PTX_LOG_WARP_THREADS    CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
+#endif
+
+
+/// Number of smem banks (log2 and absolute): 32 for arch >= 200, otherwise 16.  \p arch is parenthesized so any integral expression may be passed.
+#ifndef CUB_LOG_SMEM_BANKS
+    #define CUB_LOG_SMEM_BANKS(arch)                        \
+        (((arch) >= 200) ?                                  \
+            (5) :                                           \
+            (4))
+    #define CUB_SMEM_BANKS(arch)                            \
+        (1 << CUB_LOG_SMEM_BANKS(arch))
+
+    #define CUB_PTX_LOG_SMEM_BANKS      CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
+    #define CUB_PTX_SMEM_BANKS          CUB_SMEM_BANKS(CUB_PTX_ARCH)
+#endif
+
+
+/// Oversubscription factor: 5 for arch >= 300, 3 for arch >= 200, otherwise 10
+#ifndef CUB_SUBSCRIPTION_FACTOR
+    #define CUB_SUBSCRIPTION_FACTOR(arch)                   \
+        (((arch) >= 300) ?                                  \
+            (5) :                                           \
+            (((arch) >= 200) ?                              \
+                (3) :                                       \
+                (10)))
+    #define CUB_PTX_SUBSCRIPTION_FACTOR             CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH)
+#endif
+
+
+/// Prefer padding overhead vs X-way conflicts greater than this threshold (1 for arch >= 300, otherwise 4)
+#ifndef CUB_PREFER_CONFLICT_OVER_PADDING
+    #define CUB_PREFER_CONFLICT_OVER_PADDING(arch)          \
+        (((arch) >= 300) ?                                  \
+            (1) :                                           \
+            (4))
+    #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING    CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
+#endif
+
+
+/// Scale down the number of warps to keep same amount of "tile" storage as the nominal configuration for 4B data.  Minimum of two warps.
+#ifndef CUB_BLOCK_THREADS
+    #define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)                        \
+        (CUB_MIN(                                                                           \
+            (NOMINAL_4B_BLOCK_THREADS) * 2,                                                 \
+            CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX(                                           \
+                ((NOMINAL_4B_BLOCK_THREADS) / CUB_WARP_THREADS(PTX_ARCH)) * 3 / 4,          \
+                ((NOMINAL_4B_BLOCK_THREADS) / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T))))
+#endif
+
+/// Scale up/down number of items per thread to keep the same amount of "tile" storage as the nominal configuration for 4B data.  Minimum 1 item per thread
+#ifndef CUB_ITEMS_PER_THREAD
+    #define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)    \
+        (CUB_MIN(                                                                                       \
+            (NOMINAL_4B_ITEMS_PER_THREAD) * 2,                                                          \
+            CUB_MAX(                                                                                    \
+                1,                                                                                      \
+                ((NOMINAL_4B_ITEMS_PER_THREAD) * (NOMINAL_4B_BLOCK_THREADS) * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH))))
+#endif
+
+/// Define both nominal threads-per-block and items-per-thread (expands to two comma-separated values, both evaluated for arch 200)
+#ifndef CUB_NOMINAL_CONFIG
+    #define CUB_NOMINAL_CONFIG(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)    \
+        CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200),                                \
+        CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200)
+#endif
+
+
+
+#endif  // Do not document
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_debug.cuh b/external/cub/cub/util_debug.cuh
new file mode 100644
index 00000000000..1ad60cf2db6
--- /dev/null
+++ b/external/cub/cub/util_debug.cuh
@@ -0,0 +1,145 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Error and event logging routines.
+ *
+ * The following macro definitions are supported:
+ * - \p CUB_LOG.  Simple event messages are printed to \p stdout.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include "util_namespace.cuh"
+#include "util_arch.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+
+/// CUB error reporting macro (prints error messages to stderr)
+#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR)
+    #define CUB_STDERR
+#endif
+
+
+
+/**
+ * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context.
+ *
+ * \return The CUDA error, unchanged, so the call can be used directly inside a condition.
+ */
+__host__ __device__ __forceinline__ cudaError_t Debug(
+    cudaError_t     error,      ///< [in] CUDA error code to inspect
+    const char*     filename,   ///< [in] Source file name reported with the message
+    int             line)       ///< [in] Source line number reported with the message
+{
+    (void)filename;             // referenced only when CUB_STDERR is defined
+    (void)line;
+#ifdef CUB_STDERR
+    if (error)
+    {
+    #if (CUB_PTX_ARCH == 0)
+        fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
+        fflush(stderr);
+    #elif (CUB_PTX_ARCH >= 200)
+        printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);  // indices are printed in z,y,x order; no error string on this path
+    #endif
+    }
+#endif
+    return error;
+}
+
+
+/**
+ * \brief Debug macro: casts \p e to cudaError_t, passes it to cub::Debug with the caller's __FILE__ and __LINE__, and evaluates to that error.
+ */
+#ifndef CubDebug
+    #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__)
+#endif
+
+
+/**
+ * \brief Debug macro with exit: like CubDebug, but calls exit(1) when the error is non-zero.  NOTE(review): expands to a bare `if` statement, so avoid placing it directly before an `else`.
+ */
+#ifndef CubDebugExit
+    #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); }
+#endif
+
+
+/**
+ * \brief Log macro for printf statements.  Device-side expansions prefix the message with block and thread indices.  NOTE(review): the expansion already ends in ';' and needs at least one argument after \p format.
+ */
+#if !defined(_CubLog)
+    #if !(defined(__clang__) && defined(__CUDA__))
+        #if (CUB_PTX_ARCH == 0)
+            #define _CubLog(format, ...) printf(format,__VA_ARGS__);
+        #elif (CUB_PTX_ARCH >= 200)
+            #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__);
+        #endif
+    #else
+        // XXX shameless hack for clang around variadic printf...
+        //     Compiles w/o supplying -std=c++11 but shows warnings,
+        //     so we silence them :)
+        #pragma clang diagnostic ignored "-Wc++11-extensions"
+        #pragma clang diagnostic ignored "-Wunnamed-type-template-args"
+            template <class... Args>
+            inline __host__ __device__ void va_printf(char const* format, Args const&... args)
+            {
+        #ifdef __CUDA_ARCH__
+              printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...);
+        #else
+              printf(format, args...);
+        #endif
+            }
+        #ifndef __CUDA_ARCH__
+            #define _CubLog(format, ...) va_printf(format,__VA_ARGS__);
+        #else
+            #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__);
+        #endif
+    #endif
+#endif
+
+
+
+
+/** @} */       // end group UtilMgmt
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_device.cuh b/external/cub/cub/util_device.cuh
new file mode 100644
index 00000000000..fa73dbd74f1
--- /dev/null
+++ b/external/cub/cub/util_device.cuh
@@ -0,0 +1,347 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Properties of a given CUDA device and the corresponding PTX bundle
+ */
+
+#pragma once
+
+#include "util_type.cuh"
+#include "util_arch.cuh"
+#include "util_debug.cuh"
+#include "util_namespace.cuh"
+#include "util_macro.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).  Each sub-allocation is placed at a 256-byte-aligned offset; returns cudaErrorInvalidValue when the supplied storage is too small.
+ */
+template <int ALLOCATIONS>
+__host__ __device__ __forceinline__
+cudaError_t AliasTemporaries(
+    void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
+    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
+    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
+{
+    const int ALIGN_BYTES   = 256;
+    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);
+
+    // Compute exclusive prefix sum over allocation requests (each request rounded up to a multiple of ALIGN_BYTES)
+    size_t allocation_offsets[ALLOCATIONS];
+    size_t bytes_needed = 0;
+    for (int i = 0; i < ALLOCATIONS; ++i)
+    {
+        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
+        allocation_offsets[i] = bytes_needed;
+        bytes_needed += allocation_bytes;
+    }
+    bytes_needed += ALIGN_BYTES - 1;    // slack so the base pointer itself can be rounded up below
+
+    // Check if the caller is simply requesting the size of the storage allocation
+    if (!d_temp_storage)
+    {
+        temp_storage_bytes = bytes_needed;
+        return cudaSuccess;
+    }
+
+    // Check if enough storage provided
+    if (temp_storage_bytes < bytes_needed)
+    {
+        return CubDebug(cudaErrorInvalidValue);
+    }
+
+    // Alias: round the base pointer up to ALIGN_BYTES, then hand out the precomputed offsets
+    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
+    for (int i = 0; i < ALLOCATIONS; ++i)
+    {
+        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device.  The template parameter \p T is unused by the (empty) body.
+ */
+template <typename T>
+__global__ void EmptyKernel(void) { }
+
+
+#endif  // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).  Written to \p ptx_version; returns cudaErrorInvalidConfiguration when runtime calls are unavailable in this compiler pass.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
+{
+    struct Dummy
+    {
+        /// Type definition of the EmptyKernel kernel entry point
+        typedef void (*EmptyKernelPtr)();
+
+        /// Force EmptyKernel<void> to be generated if this class is used (nothing in this function body references Dummy)
+        CUB_RUNTIME_FUNCTION __forceinline__
+        EmptyKernelPtr Empty()
+        {
+            return EmptyKernel<void>;
+        }
+    };
+
+
+#ifndef CUB_RUNTIME_ENABLED
+    (void)ptx_version;
+
+    // CUDA API calls not supported from this device
+    return cudaErrorInvalidConfiguration;
+
+#elif (CUB_PTX_ARCH > 0)
+
+    ptx_version = CUB_PTX_ARCH;     // device pass: the version is the compile-time target
+    return cudaSuccess;
+
+#else
+
+    cudaError_t error = cudaSuccess;
+    do
+    {
+        cudaFuncAttributes empty_kernel_attrs;
+        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
+        ptx_version = empty_kernel_attrs.ptxVersion * 10;   // presumably ptxVersion is major * 10 + minor, hence the scaling - TODO confirm
+    }
+    while (0);
+
+    return error;
+
+#endif
+}
+
+
+/**
+ * \brief Retrieves the SM version (major * 100 + minor * 10) of device \p device_ordinal into \p sm_version.  Returns cudaErrorInvalidConfiguration when runtime calls are unavailable in this compiler pass.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
+{
+#ifndef CUB_RUNTIME_ENABLED
+    (void)sm_version;
+    (void)device_ordinal;
+
+    // CUDA API calls not supported from this device
+    return cudaErrorInvalidConfiguration;
+
+#else
+
+    cudaError_t error = cudaSuccess;
+    do
+    {
+        // Fill in SM version (sm_version is left untouched if either attribute query fails)
+        int major, minor;
+        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
+        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
+        sm_version = major * 100 + minor * 10;
+    }
+    while (0);
+
+    return error;
+
+#endif
+}
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Synchronize the stream if specified.  On the host pass this waits on \p stream only; on device passes it falls back to cudaDeviceSynchronize() and ignores \p stream.
+ */
+CUB_RUNTIME_FUNCTION __forceinline__
+static cudaError_t SyncStream(cudaStream_t stream)
+{
+#if (CUB_PTX_ARCH == 0)
+    return cudaStreamSynchronize(stream);
+#else
+    (void)stream;
+    // Device can't yet sync on a specific stream
+    return cudaDeviceSynchronize();
+#endif
+}
+
+
+/**
+ * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of the MaxSmOccupancy function.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_device.cuh>
+ *
+ * template <typename T>
+ * __global__ void ExampleKernel()
+ * {
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ volatile T buffer[4096];
+ *
+ *        ...
+ * }
+ *
+ *     ...
+ *
+ * // Determine SM occupancy for ExampleKernel specialized for unsigned char
+ * int max_sm_occupancy;
+ * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
+ *
+ * // max_sm_occupancy  <-- 4 on SM10
+ * // max_sm_occupancy  <-- 8 on SM20
+ * // max_sm_occupancy  <-- 12 on SM35
+ *
+ * \endcode
+ *
+ */
+template <typename KernelPtr>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t MaxSmOccupancy(
+    int                 &max_sm_occupancy,          ///< [out] maximum number of thread blocks that can reside on a single SM
+    KernelPtr           kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy
+    int                 block_threads,              ///< [in] Number of threads per thread block
+    int                 dynamic_smem_bytes = 0)     ///< [in] Dynamically-allocated shared memory per thread block, in bytes (forwarded to the occupancy query)
+{
+#ifndef CUB_RUNTIME_ENABLED
+    (void)dynamic_smem_bytes;
+    (void)block_threads;
+    (void)kernel_ptr;
+    (void)max_sm_occupancy;
+
+    // CUDA API calls not supported from this device
+    return CubDebug(cudaErrorInvalidConfiguration);
+
+#else
+
+    return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(    // routed through CubDebug like the other runtime calls in this file
+        &max_sm_occupancy,
+        kernel_ptr,
+        block_threads,
+        dynamic_smem_bytes));
+
+#endif  // CUB_RUNTIME_ENABLED
+}
+
+
+/******************************************************************************
+ * Policy management
+ ******************************************************************************/
+
+/**
+ * Kernel dispatch configuration: launch shape taken from an agent policy plus the SM occupancy measured for a kernel
+ */
+struct KernelConfig
+{
+    int block_threads;      // threads per block (AgentPolicyT::BLOCK_THREADS)
+    int items_per_thread;   // items per thread (AgentPolicyT::ITEMS_PER_THREAD)
+    int tile_size;          // block_threads * items_per_thread
+    int sm_occupancy;       // value written by MaxSmOccupancy() in Init()
+
+    CUB_RUNTIME_FUNCTION __forceinline__
+    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}
+
+    template <typename AgentPolicyT, typename KernelPtrT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Init(KernelPtrT kernel_ptr)
+    {
+        block_threads        = AgentPolicyT::BLOCK_THREADS;
+        items_per_thread     = AgentPolicyT::ITEMS_PER_THREAD;
+        tile_size            = block_threads * items_per_thread;
+        cudaError_t retval   = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);    // dynamic shared memory left at its default of 0
+        return retval;
+    }
+};
+
+
+
+/// Helper for dispatching into a policy chain: each link pairs a minimum PTX version with a policy and names the previous (older) link
+template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
+struct ChainedPolicy
+{
+   /// The policy for the active compiler pass (chosen at compile time from CUB_PTX_ARCH; falls back to the previous link when the target is older than PTX_VERSION)
+   typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;
+
+   /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version (chosen at run time from \p ptx_version)
+   template <typename FunctorT>
+   CUB_RUNTIME_FUNCTION __forceinline__
+   static cudaError_t Invoke(int ptx_version, FunctorT &op)
+   {
+       if (ptx_version < PTX_VERSION) {
+           return PrevPolicyT::Invoke(ptx_version, op);
+       }
+       return op.template Invoke<PolicyT>();
+   }
+};
+
+/// Helper for dispatching into a policy chain (end-of-chain specialization, selected when the previous policy is the policy itself)
+template <int PTX_VERSION, typename PolicyT>
+struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
+{
+    /// The policy for the active compiler pass
+    typedef PolicyT ActivePolicy;
+
+    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version (always this policy; the version argument is ignored)
+    template <typename FunctorT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) {
+        return op.template Invoke<PolicyT>();
+    }
+};
+
+
+
+
+#endif  // Do not document
+
+
+
+
+/** @} */       // end group UtilMgmt
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_macro.cuh b/external/cub/cub/util_macro.cuh
new file mode 100644
index 00000000000..73c29d22c5c
--- /dev/null
+++ b/external/cub/cub/util_macro.cuh
@@ -0,0 +1,103 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Common C/C++ macro utilities
+ ******************************************************************************/
+
+#pragma once
+
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+#ifndef CUB_ALIGN
+    #if defined(_WIN32) || defined(_WIN64)
+        /// Align struct.  NOTE(review): this branch ignores \p bytes and always requests 32-byte alignment -- confirm this is intentional
+        #define CUB_ALIGN(bytes) __declspec(align(32))
+    #else
+        /// Align struct to \p bytes using the GCC-style aligned attribute
+        #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
+    #endif
+#endif
+
+#ifndef CUB_MAX
+    /// Select maximum(a, b).  Yields \p a when the operands compare equal.  Each argument may be evaluated twice, so avoid side effects.
+    #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
+#endif
+
+#ifndef CUB_MIN
+    /// Select minimum(a, b).  Yields \p a when the operands compare equal.  Each argument may be evaluated twice, so avoid side effects.
+    #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
+#endif
+
+#ifndef CUB_QUOTIENT_FLOOR
+    /// Quotient of x/y via the built-in division operator (rounds down to the nearest integer for non-negative integer operands)
+    #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
+#endif
+
+#ifndef CUB_QUOTIENT_CEILING
+    /// Quotient of x/y rounded up to nearest integer (intended for non-negative integer x and positive y; \p y is evaluated twice)
+    #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
+#endif
+
+#ifndef CUB_ROUND_UP_NEAREST
+    /// x rounded up to the nearest multiple of y (every use of \p y is parenthesized so expression arguments such as `a + b` expand correctly)
+    #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * (y))
+#endif
+
+#ifndef CUB_ROUND_DOWN_NEAREST
+    /// x rounded down to the nearest multiple of y (every use of \p y is parenthesized so expression arguments such as `a + b` expand correctly)
+    #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * (y))
+#endif
+
+
+#ifndef CUB_STATIC_ASSERT
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+        #define CUB_CAT_(a, b) a ## b                   // Raw token paste
+        #define CUB_CAT(a, b) CUB_CAT_(a, b)            // Extra level so arguments such as __LINE__ are expanded before pasting
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    /// Static assert: declares a typedef of an int array whose extent is -1 (ill-formed) when \p cond is false.  \p msg is not used by the expansion.
+    #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
+#endif
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_namespace.cuh b/external/cub/cub/util_namespace.cuh
new file mode 100644
index 00000000000..edb61260669
--- /dev/null
+++ b/external/cub/cub/util_namespace.cuh
@@ -0,0 +1,46 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Place-holder for prefixing the cub namespace
+ */
+
+#pragma once
+
+// To wrap cub in outer namespace(s), define both macros before this header is first included.  For example:
+//#define CUB_NS_PREFIX namespace thrust{ namespace detail {
+//#define CUB_NS_POSTFIX } }
+
+#ifndef CUB_NS_PREFIX
+#define CUB_NS_PREFIX       // Default: expands to nothing (no outer namespace is opened)
+#endif
+
+#ifndef CUB_NS_POSTFIX
+#define CUB_NS_POSTFIX      // Default: expands to nothing (no outer namespace to close)
+#endif
diff --git a/external/cub/cub/util_ptx.cuh b/external/cub/cub/util_ptx.cuh
new file mode 100644
index 00000000000..fae6e4fae2e
--- /dev/null
+++ b/external/cub/cub/util_ptx.cuh
@@ -0,0 +1,729 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * PTX intrinsics
+ */
+
+
+#pragma once
+
+#include "util_type.cuh"
+#include "util_arch.cuh"
+#include "util_namespace.cuh"
+#include "util_debug.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilPtx
+ * @{
+ */
+
+
+/******************************************************************************
+ * PTX helper macros
+ ******************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Register modifier for pointer-types (for inlining PTX assembly).  Picks the asm constraint letter and PTX integer type string for 64-bit vs. 32-bit builds.
+ */
+#if defined(_WIN64) || defined(__LP64__)
+    #define __CUB_LP64__ 1
+    // 64-bit register modifier for inlined asm ("l" constraint, u64 PTX type)
+    #define _CUB_ASM_PTR_ "l"
+    #define _CUB_ASM_PTR_SIZE_ "u64"
+#else
+    #define __CUB_LP64__ 0
+    // 32-bit register modifier for inlined asm ("r" constraint, u32 PTX type)
+    #define _CUB_ASM_PTR_ "r"
+    #define _CUB_ASM_PTR_SIZE_ "u32"
+#endif
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Inlined PTX intrinsics
+ ******************************************************************************/
+
+/**
+ * \brief Shift-right then add.  Returns (\p x >> \p shift) + \p addend.  When CUB_PTX_ARCH >= 200 this is one PTX vshr instruction using the .clamp shift mode.
+ */
+__device__ __forceinline__ unsigned int SHR_ADD(
+    unsigned int x,
+    unsigned int shift,
+    unsigned int addend)
+{
+    unsigned int ret;
+#if CUB_PTX_ARCH >= 200     // inline-PTX path
+    asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
+        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
+#else                       // plain C++ fallback
+    ret = (x >> shift) + addend;
+#endif
+    return ret;
+}
+
+
+/**
+ * \brief Shift-left then add.  Returns (\p x << \p shift) + \p addend.  When CUB_PTX_ARCH >= 200 this is one PTX vshl instruction using the .clamp shift mode.
+ */
+__device__ __forceinline__ unsigned int SHL_ADD(
+    unsigned int x,
+    unsigned int shift,
+    unsigned int addend)
+{
+    unsigned int ret;
+#if CUB_PTX_ARCH >= 200     // inline-PTX path
+    asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
+        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
+#else                       // plain C++ fallback
+    ret = (x << shift) + addend;
+#endif
+    return ret;
+}
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Bitfield-extract (generic overload; \p source is handled as a 32-bit value).  The trailing Int2Type tag only selects the overload.
+ */
+template <typename UnsignedBits, int BYTE_LEN>
+__device__ __forceinline__ unsigned int BFE(
+    UnsignedBits            source,
+    unsigned int            bit_start,
+    unsigned int            num_bits,
+    Int2Type<BYTE_LEN>      /*byte_len*/)
+{
+    unsigned int bits;
+#if CUB_PTX_ARCH >= 200
+    asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
+#else
+    const unsigned int MASK = (1u << num_bits) - 1;    // unsigned literal: no signed-shift overflow when num_bits == 31
+    bits = (source >> bit_start) & MASK;
+#endif
+    return bits;
+}
+
+
+/**
+ * Bitfield-extract for 64-bit types (selected by the Int2Type<8> tag).  Uses shift-and-mask instead of inline PTX; the result is narrowed to the unsigned int return type.
+ */
+template <typename UnsignedBits>
+__device__ __forceinline__ unsigned int BFE(
+    UnsignedBits            source,
+    unsigned int            bit_start,
+    unsigned int            num_bits,
+    Int2Type<8>             /*byte_len*/)
+{
+    const unsigned long long MASK = (1ull << num_bits) - 1;    // low num_bits set
+    return (source >> bit_start) & MASK;
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Bitfield-extract.  Extracts \p num_bits from \p source starting at bit-offset \p bit_start.  The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type.  The result is always returned as an unsigned int.
+ */
+template <typename UnsignedBits>
+__device__ __forceinline__ unsigned int BFE(
+    UnsignedBits source,
+    unsigned int bit_start,
+    unsigned int num_bits)
+{
+    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());    // tag-dispatch on the byte width of the source type
+}
+
+
+/**
+ * \brief Bitfield insert.  Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start and stores the result in \p ret.
+ */
+__device__ __forceinline__ void BFI(
+    unsigned int &ret,
+    unsigned int x,
+    unsigned int y,
+    unsigned int bit_start,
+    unsigned int num_bits)
+{
+#if CUB_PTX_ARCH >= 200
+    asm ("bfi.b32 %0, %1, %2, %3, %4;" :
+        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));    // bfi operand order: field source (y), then base (x)
+#else
+    y <<= bit_start;                                                // align the field taken from y with its destination
+    unsigned int MASK_Y = ((1u << num_bits) - 1) << bit_start;      // bits of the result that come from y
+    unsigned int MASK_X = ~MASK_Y;                                  // bits of the result that are kept from x
+    ret = (x & MASK_X) | (y & MASK_Y);
+#endif
+}
+
+
+/**
+ * \brief Three-operand add.  Returns \p x + \p y + \p z.  The by-value parameter \p x doubles as the result register.
+ */
+__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
+{
+#if CUB_PTX_ARCH >= 200     // one PTX vadd instruction with a merged third-operand add
+    asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
+#else                       // plain C++ fallback
+    x = x + y + z;
+#endif
+    return x;
+}
+
+
+/**
+ * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register.  For SM2.0 or later.
+ *
+ * \par
+ * The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
+ * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
+ * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
+ * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
+ *
+ * \par Snippet
+ * The code snippet below illustrates byte-permute.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     int a        = 0x03020100;
+ *     int b        = 0x07060504;
+ *     int index    = 0x00007531;
+ *
+ *     int selected = PRMT(a, b, index);    // 0x07050301 (nibbles 1,3,5,7 pick bytes b1,b3,b5,b7)
+ *
+ * \endcode
+ *
+ */
+__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
+{
+    int ret;
+    asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));    // no non-PTX fallback is provided
+    return ret;
+}
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Sync-threads barrier.  Issues PTX bar.sync on barrier number 1, passing \p count as the instruction's second operand.
+ */
+__device__ __forceinline__ void BAR(int count)
+{
+    asm volatile("bar.sync 1, %0;" : : "r"(count));
+}
+
+/**
+ * CTA barrier (thin wrapper over __syncthreads())
+ */
+__device__  __forceinline__ void CTA_SYNC()
+{
+    __syncthreads();
+}
+
+
+/**
+ * CTA barrier with predicate (thin wrapper over __syncthreads_and(); returns that intrinsic's result for \p p)
+ */
+__device__  __forceinline__ int CTA_SYNC_AND(int p)
+{
+    return __syncthreads_and(p);
+}
+
+
+/**
+ * Warp barrier.  Calls __syncwarp(member_mask) when CUB_USE_COOPERATIVE_GROUPS is defined; otherwise compiles to nothing.
+ */
+__device__  __forceinline__ void WARP_SYNC(unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    __syncwarp(member_mask);
+#endif
+}
+
+
+/**
+ * Warp any.  Forwards to __any_sync(member_mask, predicate) when CUB_USE_COOPERATIVE_GROUPS is defined; the legacy path ignores \p member_mask.
+ */
+__device__  __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    return __any_sync(member_mask, predicate);
+#else
+    return ::__any(predicate);
+#endif
+}
+
+
+/**
+ * Warp all.  Forwards to __all_sync(member_mask, predicate) when CUB_USE_COOPERATIVE_GROUPS is defined; the legacy path ignores \p member_mask.
+ */
+__device__  __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    return __all_sync(member_mask, predicate);
+#else
+    return ::__all(predicate);
+#endif
+}
+
+
+/**
+ * Warp ballot.  Forwards to __ballot_sync(member_mask, predicate) when CUB_USE_COOPERATIVE_GROUPS is defined; the legacy path ignores \p member_mask.  The ballot is returned as an int.
+ */
+__device__  __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    return __ballot_sync(member_mask, predicate);
+#else
+    return __ballot(predicate);
+#endif
+}
+
+/**
+ * Warp synchronous shfl_up of one 32-bit \p word.  \p first_lane is passed as the instruction's third source operand; \p member_mask is used only when CUB_USE_COOPERATIVE_GROUPS is defined.
+ */
+__device__ __forceinline__ 
+unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int first_lane, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(first_lane), "r"(member_mask));
+#else
+    asm volatile("shfl.up.b32 %0, %1, %2, %3;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(first_lane));
+#endif
+    return word;
+}
+
+/**
+ * Warp synchronous shfl_down of one 32-bit \p word.  \p last_lane is passed as the instruction's third source operand; \p member_mask is used only when CUB_USE_COOPERATIVE_GROUPS is defined.
+ */
+__device__ __forceinline__ 
+unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int last_lane, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane), "r"(member_mask));
+#else
+    asm volatile("shfl.down.b32 %0, %1, %2, %3;"
+        : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane));
+#endif
+    return word;
+}
+
+/**
+ * Warp synchronous shfl_idx of one 32-bit \p word from lane \p src_lane.  \p last_lane is passed as the instruction's third source operand; \p member_mask is used only when CUB_USE_COOPERATIVE_GROUPS is defined.
+ */
+__device__ __forceinline__ 
+unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int last_lane, unsigned int member_mask)
+{
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+    asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
+        : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane), "r"(member_mask));
+#else
+    asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
+        : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane));
+#endif
+    return word;
+}
+
+/**
+ * Single-precision floating point multiply via PTX mul.rz.f32. (Mantissa LSB rounds towards zero.)  Returns \p a * \p b.
+ */
+__device__ __forceinline__ float FMUL_RZ(float a, float b)
+{
+    float d;
+    asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
+    return d;
+}
+
+
+/**
+ * Single-precision floating point multiply-add via PTX fma.rz.f32. (Mantissa LSB rounds towards zero.)  Returns \p a * \p b + \p c.
+ */
+__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
+{
+    float d;
+    asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
+    return d;
+}
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Terminates the calling thread (emits the PTX exit instruction)
+ */
+__device__ __forceinline__ void ThreadExit() {
+    asm volatile("exit;");
+}    
+
+
+/**
+ * \brief  Abort execution and generate an interrupt to the host CPU (emits the PTX trap instruction)
+ */
+__device__ __forceinline__ void ThreadTrap() {
+    asm volatile("trap;");
+}
+
+
+/**
+ * \brief Computes the calling thread's linear rank within its block (x varies fastest, then y, then z) from caller-supplied block extents.  A y or z term is skipped when that extent equals 1.
+ */
+__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
+{
+    const unsigned int z_term = (block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y);
+    const unsigned int y_term = (block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x);
+    return z_term + y_term + threadIdx.x;
+}
+
+
+/**
+ * \brief Returns the warp lane ID of the calling thread (read from the PTX special register \%laneid)
+ */
+__device__ __forceinline__ unsigned int LaneId()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
+    return ret;
+}
+
+
+/**
+ * \brief Returns the warp ID of the calling thread (read from the PTX special register \%warpid).  Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
+ */
+__device__ __forceinline__ unsigned int WarpId()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes less than the calling thread (read from the PTX special register \%lanemask_lt)
+ */
+__device__ __forceinline__ unsigned int LaneMaskLt()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread (read from the PTX special register \%lanemask_le)
+ */
+__device__ __forceinline__ unsigned int LaneMaskLe()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes greater than the calling thread (read from the PTX special register \%lanemask_gt)
+ */
+__device__ __forceinline__ unsigned int LaneMaskGt()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
+    return ret;
+}
+
+/**
+ * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread (read from the PTX special register \%lanemask_ge)
+ */
+__device__ __forceinline__ unsigned int LaneMaskGe()
+{
+    unsigned int ret;
+    asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
+    return ret;
+}
+
+/** @} */       // end group UtilPtx
+
+
+
+
+/**
+ * \brief Shuffle-up for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>-<tt>src_offset</tt></sub>.  For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png)
+ * \ingroup WarpModule
+ *
+ * \par
+ * - Available only for SM3.0 or newer
+ *
+ * \par Snippet
+ * The code snippet below illustrates each thread obtaining a \p double value from the
+ * predecessor of its predecessor.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Obtain one input item per thread
+ *     double thread_data = ...
+ *
+ *     // Obtain item from two ranks below
+ *     double peer_data = ShuffleUp(thread_data, 2, 0, 0xffffffff);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
+ * The corresponding output \p peer_data will be <tt>{1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}</tt>.
+ *
+ */
+template <typename T>
+__device__ __forceinline__ T ShuffleUp(
+    T               input,              ///< [in] The calling thread's value to contribute to the exchange
+    int             src_offset,         ///< [in] The relative down-offset of the peer to read from
+    int             first_lane,         ///< [in] Index of first lane in segment (typically 0)
+    unsigned int    member_mask)        ///< [in] 32-bit mask of participating warp lanes
+{
+    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
+
+    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);    // shuffle words needed to cover T (rounded up)
+ 
+    T               output;
+    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);    // view output as an array of shuffle words
+    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);     // view input as an array of shuffle words
+
+    unsigned int shuffle_word;
+    shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_lane, member_mask);    // first word is exchanged outside the loop
+    output_alias[0] = shuffle_word;
+
+    #pragma unroll
+    for (int WORD = 1; WORD < WORDS; ++WORD)
+    {
+        shuffle_word       = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_lane, member_mask);
+        output_alias[WORD] = shuffle_word;
+    }
+
+    return output;
+}
+
+
+/**
+ * \brief Shuffle-down for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>+<tt>src_offset</tt></sub>.  For thread lanes \e i where \e i + src_offset lies beyond \p last_lane, the thread's own \p input is returned to the thread.  ![](shfl_down_logo.png)
+ * \ingroup WarpModule
+ *
+ * \par
+ * - Available only for SM3.0 or newer
+ *
+ * \par Snippet
+ * The code snippet below illustrates each thread obtaining a \p double value from the
+ * successor of its successor.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Obtain one input item per thread
+ *     double thread_data = ...
+ *
+ *     // Obtain item from two ranks above
+ *     double peer_data = ShuffleDown(thread_data, 2, 31, 0xffffffff);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
+ * The corresponding output \p peer_data will be <tt>{3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}</tt>.
+ *
+ */
+template <typename T>
+__device__ __forceinline__ T ShuffleDown(
+    T               input,              ///< [in] The calling thread's value to contribute to the exchange
+    int             src_offset,         ///< [in] The relative up-offset of the peer to read from
+    int             last_lane,          ///< [in] Index of last lane in segment (typically 31)
+    unsigned int    member_mask)        ///< [in] 32-bit mask of participating warp lanes
+{
+    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
+
+    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);    // shuffle words needed to cover T (rounded up)
+
+    T               output;
+    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);    // view output as an array of shuffle words
+    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);     // view input as an array of shuffle words
+
+    unsigned int shuffle_word;
+    shuffle_word    = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_lane, member_mask);    // first word is exchanged outside the loop
+    output_alias[0] = shuffle_word;
+
+    #pragma unroll
+    for (int WORD = 1; WORD < WORDS; ++WORD)
+    {
+        shuffle_word       = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_lane, member_mask);
+        output_alias[WORD] = shuffle_word;
+    }
+
+    return output;
+}
+
+
+/**
+ * \brief Shuffle-broadcast for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input
+ * contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>.  For \p src_lane < 0 or \p src_lane >= WARP_THREADS,
+ * the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png)
+ *
+ * \ingroup WarpModule
+ *
+ * \par
+ * - Available only for SM3.0 or newer
+ *
+ * \par Snippet
+ * The code snippet below illustrates each thread obtaining a \p double value from <em>warp-lane</em><sub>0</sub>.
+ *
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Obtain one input item per thread
+ *     double thread_data = ...
+ *
+ *     // Obtain item from thread 0
+ *     double peer_data = ShuffleIndex(thread_data, 0, 32, 0xffffffff);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
+ * The corresponding output \p peer_data will be <tt>{1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}</tt>.
+ *
+ */
+template <typename T>
+__device__ __forceinline__ T ShuffleIndex(
+    T               input,                  ///< [in] The calling thread's value to contribute to the exchange
+    int             src_lane,               ///< [in] Which warp lane is to do the broadcasting
+    int             logical_warp_threads,   ///< [in] Number of threads per logical warp
+    unsigned int    member_mask)            ///< [in] 32-bit mask of participating warp lanes
+{
+    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
+
+    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);    // shuffle words needed to cover T (rounded up)
+
+    T               output;
+    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);    // view output as an array of shuffle words
+    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);     // view input as an array of shuffle words
+
+    unsigned int shuffle_word;
+    shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
+                                 src_lane,
+                                 logical_warp_threads - 1,    // passed as the last_lane argument
+                                 member_mask);
+
+    output_alias[0] = shuffle_word;
+
+    #pragma unroll
+    for (int WORD = 1; WORD < WORDS; ++WORD)
+    {
+        shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
+                                     src_lane,
+                                     logical_warp_threads - 1,
+                                     member_mask);
+
+        output_alias[WORD] = shuffle_word;
+    }
+
+    return output;
+}
+
+
+
+/**
+ * Compute a 32b mask of threads having the same least-significant
+ * LABEL_BITS of \p label as the calling thread.  One warp ballot is taken per label bit and the per-bit agreement masks are ANDed together.
+ */
+template <int LABEL_BITS>
+inline __device__ unsigned int MatchAny(unsigned int label)
+{
+    unsigned int retval;    // NOTE(review): only assigned inside the loop, so LABEL_BITS is presumably expected to be >= 1 -- confirm
+
+    // Extract masks of common threads for each bit
+    #pragma unroll
+    for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
+    {
+        unsigned int mask;
+        unsigned int current_bit = 1 << BIT;
+        asm ("{\n"
+            "    .reg .pred p;\n"
+            "    and.b32 %0, %1, %2;"
+            "    setp.eq.u32 p, %0, %2;\n"
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+            "    vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
+#else
+            "    vote.ballot.b32 %0, p;\n"
+#endif
+            "    @!p not.b32 %0, %0;\n"
+            "}\n" : "=r"(mask) : "r"(label), "r"(current_bit));
+
+        // Remove peers who differ (mask holds the lanes whose value for this bit matches the caller's)
+        retval = (BIT == 0) ? mask : retval & mask;
+    }
+
+    return retval;
+
+//  // VOLTA match
+//    unsigned int retval;
+//    asm ("{\n"
+//         "    match.any.sync.b32 %0, %1, 0xffffffff;\n"
+//         "}\n" : "=r"(retval) : "r"(label));
+//    return retval;
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_type.cuh b/external/cub/cub/util_type.cuh
new file mode 100644
index 00000000000..7de5427fa7a
--- /dev/null
+++ b/external/cub/cub/util_type.cuh
@@ -0,0 +1,1141 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Common type manipulation (metaprogramming) utilities
+ */
+
+#pragma once
+
+#include <iostream>
+#include <limits>
+#include <cfloat>
+
+#include "util_macro.cuh"
+#include "util_arch.cuh"
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+
+
+/******************************************************************************
+ * Conditional types
+ ******************************************************************************/
+
+/**
+ * \brief Compile-time type selection (<tt>IF ? ThenType : ElseType</tt>), exposed as the nested typedef \p Type
+ */
+template <bool IF, typename ThenType, typename ElseType>
+struct If
+{
+    /// Conditional type result (the primary template covers <tt>IF == true</tt>)
+    typedef ThenType Type;      // true
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+// Partial specialization chosen when IF is false: Type names ElseType
+template <typename ThenType, typename ElseType>
+struct If<false, ThenType, ElseType>
+{
+    typedef ElseType Type;      // false
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Type equality
+ ******************************************************************************/
+
+/**
+ * \brief Type equality test: \p VALUE is 1 when \p A and \p B are the same type, \p NEGATE is its complement
+ */
+template <typename A, typename B>
+struct Equals
+{
+    enum {
+        VALUE = 0,      // Primary template: A and B are different types
+        NEGATE = 1
+    };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+// Partial specialization matched when both arguments are the same type
+template <typename A>
+struct Equals <A, A>
+{
+    enum {
+        VALUE = 1,      // Both template arguments name the same type
+        NEGATE = 0
+    };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Static math
+ ******************************************************************************/
+
+/**
+ * \brief Compile-time log2(N), rounded up to the next integer.
+ *
+ * For example:
+ *     Log2<8>::VALUE   // 3
+ *     Log2<3>::VALUE   // 2
+ */
+template <int N, int CURRENT_VAL = N, int COUNT = 0>
+struct Log2
+{
+    /// Static logarithm value (recursion: shift CURRENT_VAL right once more and count the shift)
+    enum { VALUE = Log2<N, (CURRENT_VAL >> 1), (COUNT + 1)>::VALUE };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+// Recursion terminator: CURRENT_VAL has been shifted down to zero after COUNT shifts
+template <int N, int COUNT>
+struct Log2<N, 0, COUNT>
+{
+    enum {
+        // Step back by one when 2^(COUNT - 1) already reaches N (i.e. N is an exact power of two)
+        VALUE = COUNT - (((1 << (COUNT - 1)) >= N) ? 1 : 0) };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Compile-time test for N having at most one bit set (power-of-two; VALUE is also 1 for N == 0)
+ */
+template <int N>
+struct PowerOfTwo
+{
+    enum { VALUE = !(N & (N - 1)) };
+};
+
+
+
+/******************************************************************************
+ * Pointer vs. iterator detection
+ ******************************************************************************/
+
+/**
+ * \brief Pointer vs. iterator: \p VALUE is 1 when \p Tp has the form <tt>U*</tt>, otherwise 0
+ */
+template <typename Tp>
+struct IsPointer
+{
+    enum { VALUE = 0 };     // Primary template: not matched as a raw pointer
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+// Partial specialization for raw pointers (only the form Tp* is specialized here)
+template <typename Tp>
+struct IsPointer<Tp*>
+{
+    enum { VALUE = 1 };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Qualifier detection
+ ******************************************************************************/
+
+/**
+ * \brief Volatile modifier test: \p VALUE is 1 when \p Tp is volatile-qualified, otherwise 0
+ */
+template <typename Tp>
+struct IsVolatile
+{
+    enum { VALUE = 0 };     // Primary template: no volatile qualifier matched
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+// Partial specialization matched for volatile-qualified types
+template <typename Tp>
+struct IsVolatile<Tp volatile>
+{
+    enum { VALUE = 1 };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Qualifier removal
+ ******************************************************************************/
+
+/**
+ * \brief Removes \p const and \p volatile qualifiers from type \p Tp.
+ *
+ * The second parameter \p Up defaults to \p Tp; the specializations pattern-match on it to strip the qualifiers. For example:
+ *     <tt>typename RemoveQualifiers<volatile int>::Type         // int;</tt>
+ */
+template <typename Tp, typename Up = Tp>
+struct RemoveQualifiers
+{
+    /// Type without \p const and \p volatile qualifiers
+    typedef Up Type;
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+// NOTE(review): no matching #endif follows these specializations in this part of the file -- confirm where this guard is closed
+template <typename Tp, typename Up>
+struct RemoveQualifiers<Tp, volatile Up>
+{
+    typedef Up Type;        // volatile stripped
+};
+
+template <typename Tp, typename Up>
+struct RemoveQualifiers<Tp, const Up>
+{
+    typedef Up Type;        // const stripped
+};
+
+template <typename Tp, typename Up>
+struct RemoveQualifiers<Tp, const volatile Up>
+{
+    typedef Up Type;        // const and volatile stripped
+};
+
+
+/******************************************************************************
+ * Marker types
+ ******************************************************************************/
+
+/**
+ * \brief A simple "NULL" marker type: assignment from any value is a no-op and all instances compare equal
+ */
+struct NullType
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+    /// Assignment from any type: discards the argument and returns this object
+    template <typename T>
+    __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; }
+    /// Equality: always true (const-qualified so it also works on const instances)
+    __host__ __device__ __forceinline__ bool operator ==(const NullType&) const { return true; }
+    /// Inequality: always false (const-qualified so it also works on const instances)
+    __host__ __device__ __forceinline__ bool operator !=(const NullType&) const { return false; }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+};
+
+
+/**
+ * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values).  The constant is exposed as \p VALUE.
+ */
+template <int A>
+struct Int2Type
+{
+   enum {VALUE = A};    // The wrapped integral constant
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/******************************************************************************
+ * Size and alignment
+ ******************************************************************************/
+
+/// Structure alignment: exposes the alignment of \p T as ALIGN_BYTES and the (possibly re-aligned) type as Type
+template <typename T>
+struct AlignBytes
+{
+    struct Pad
+    {
+        T       val;
+        char    byte;
+    };
+    // Pad places one extra char after a T, so its size exceeds sizeof(T) by the padding the compiler inserts
+    enum
+    {
+        /// The "true CUDA" alignment of T in bytes
+        ALIGN_BYTES = sizeof(Pad) - sizeof(T)
+    };
+
+    /// The "truly aligned" type
+    typedef T Type;
+};
+
+// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree
+// with device C++ compilers (EDG) on types passed as template parameters through
+// kernel functions
+// __CUB_ALIGN_BYTES(t, b): specialize AlignBytes<t> with ALIGN_BYTES = b and an __align__(b)-qualified Type
+#define __CUB_ALIGN_BYTES(t, b)         \
+    template <> struct AlignBytes<t>    \
+    { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; };
+
+__CUB_ALIGN_BYTES(short4, 8)
+__CUB_ALIGN_BYTES(ushort4, 8)
+__CUB_ALIGN_BYTES(int2, 8)
+__CUB_ALIGN_BYTES(uint2, 8)
+__CUB_ALIGN_BYTES(long long, 8)
+__CUB_ALIGN_BYTES(unsigned long long, 8)
+__CUB_ALIGN_BYTES(float2, 8)
+__CUB_ALIGN_BYTES(double, 8)
+#ifdef _WIN32
+    __CUB_ALIGN_BYTES(long2, 8)
+    __CUB_ALIGN_BYTES(ulong2, 8)
+#else
+    __CUB_ALIGN_BYTES(long2, 16)
+    __CUB_ALIGN_BYTES(ulong2, 16)
+#endif
+__CUB_ALIGN_BYTES(int4, 16)
+__CUB_ALIGN_BYTES(uint4, 16)
+__CUB_ALIGN_BYTES(float4, 16)
+__CUB_ALIGN_BYTES(long4, 16)
+__CUB_ALIGN_BYTES(ulong4, 16)
+__CUB_ALIGN_BYTES(longlong2, 16)
+__CUB_ALIGN_BYTES(ulonglong2, 16)
+__CUB_ALIGN_BYTES(double2, 16)
+__CUB_ALIGN_BYTES(longlong4, 16)
+__CUB_ALIGN_BYTES(ulonglong4, 16)
+__CUB_ALIGN_BYTES(double4, 16)
+// cv-qualified types inherit the alignment information of the unqualified type
+template <typename T> struct AlignBytes<volatile T> : AlignBytes<T> {};
+template <typename T> struct AlignBytes<const T> : AlignBytes<T> {};
+template <typename T> struct AlignBytes<const volatile T> : AlignBytes<T> {};
+
+
+/// Unit-words of data movement: for each use (shuffle, volatile, device memory, texture) the widest word type that evenly tiles \p T
+template <typename T>
+struct UnitWord
+{
+    enum {
+        ALIGN_BYTES = AlignBytes<T>::ALIGN_BYTES
+    };
+    // IsMultiple<Unit>::IS_MULTIPLE: sizeof(T) is a whole multiple of sizeof(Unit) and T's alignment is a multiple of Unit's alignment
+    template <typename Unit>
+    struct IsMultiple
+    {
+        enum {
+            UNIT_ALIGN_BYTES    = AlignBytes<Unit>::ALIGN_BYTES,
+            IS_MULTIPLE         = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0)
+        };
+    };
+
+    /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T (tried in order: int, short, then char)
+    typedef typename If<IsMultiple<int>::IS_MULTIPLE,
+        unsigned int,
+        typename If<IsMultiple<short>::IS_MULTIPLE,
+            unsigned short,
+            unsigned char>::Type>::Type         ShuffleWord;
+
+    /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T (long long, else ShuffleWord)
+    typedef typename If<IsMultiple<long long>::IS_MULTIPLE,
+        unsigned long long,
+        ShuffleWord>::Type                      VolatileWord;
+
+    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T (longlong2, else VolatileWord)
+    typedef typename If<IsMultiple<longlong2>::IS_MULTIPLE,
+        ulonglong2,
+        VolatileWord>::Type                     DeviceWord;
+
+    /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T (int4, then int2, else ShuffleWord)
+    typedef typename If<IsMultiple<int4>::IS_MULTIPLE,
+        uint4,
+        typename If<IsMultiple<int2>::IS_MULTIPLE,
+            uint2,
+            ShuffleWord>::Type>::Type           TextureWord;
+};
+
+
+// float2 specialization workaround (for SM10-SM13): the unit words are hard-coded, and VolatileWord/DeviceWord switch on CUB_PTX_ARCH
+template <>
+struct UnitWord <float2>
+{
+    typedef int         ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef float       VolatileWord;
+    typedef uint2       DeviceWord;
+#else
+    typedef unsigned long long   VolatileWord;
+    typedef unsigned long long   DeviceWord;
+#endif
+    typedef float2      TextureWord;
+};
+
+// float4 specialization workaround (for SM10-SM13): the unit words are hard-coded, and VolatileWord/DeviceWord switch on CUB_PTX_ARCH
+template <>
+struct UnitWord <float4>
+{
+    typedef int         ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef float               VolatileWord;
+    typedef uint4               DeviceWord;
+#else
+    typedef unsigned long long  VolatileWord;
+    typedef ulonglong2          DeviceWord;
+#endif
+    typedef float4              TextureWord;
+};
+
+
+// char2 specialization workaround (for SM10-SM13): every unit word is 16 bits wide; only DeviceWord's signedness switches on CUB_PTX_ARCH
+template <>
+struct UnitWord <char2>
+{
+    typedef unsigned short      ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef unsigned short      VolatileWord;
+    typedef short               DeviceWord;
+#else
+    typedef unsigned short      VolatileWord;
+    typedef unsigned short      DeviceWord;
+#endif
+    typedef unsigned short      TextureWord;
+};
+
+// cv-qualified types reuse the unit words of the unqualified type
+template <typename T> struct UnitWord<volatile T> : UnitWord<T> {};
+template <typename T> struct UnitWord<const T> : UnitWord<T> {};
+template <typename T> struct UnitWord<const volatile T> : UnitWord<T> {};
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Vector type inference utilities.
+ ******************************************************************************/
+
+/**
+ * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists.  Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields.
+ */
+template <typename T, int vec_elements> struct CubVector;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+enum
+{
+    /// The maximum number of elements in CUDA vector types (CubVector is specialized for 1 through 4 elements)
+    MAX_VEC_ELEMENTS = 4,
+};
+
+
+/**
+ * Generic vector-1 type: a plain struct with field \p x; \p Type names the struct itself
+ */
+template <typename T>
+struct CubVector<T, 1>
+{
+    T x;
+
+    typedef T BaseType;
+    typedef CubVector<T, 1> Type;
+};
+
+/**
+ * Generic vector-2 type: a plain struct with fields \p x, \p y; \p Type names the struct itself
+ */
+template <typename T>
+struct CubVector<T, 2>
+{
+    T x;
+    T y;
+
+    typedef T BaseType;
+    typedef CubVector<T, 2> Type;
+};
+
+/**
+ * Generic vector-3 type: a plain struct with fields \p x, \p y, \p z; \p Type names the struct itself
+ */
+template <typename T>
+struct CubVector<T, 3>
+{
+    T x;
+    T y;
+    T z;
+
+    typedef T BaseType;
+    typedef CubVector<T, 3> Type;
+};
+
+/**
+ * Generic vector-4 type: a plain struct with fields \p x, \p y, \p z, \p w; \p Type names the struct itself
+ */
+template <typename T>
+struct CubVector<T, 4>
+{
+    T x;
+    T y;
+    T z;
+    T w;
+
+    typedef T BaseType;
+    typedef CubVector<T, 4> Type;
+};
+
+
+/**
+ * Macro for expanding explicit (full) specializations of CubVector for built-in vector types
+ */
+#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type)                                                    \
+                                                                                                        \
+    template<> struct CubVector<base_type, 1> : short_type##1                                           \
+    {                                                                                                   \
+      typedef base_type       BaseType;                                                                 \
+      typedef short_type##1   Type;                                                                     \
+      __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {           \
+          CubVector retval;                                                                             \
+          retval.x = x + other.x;                                                                       \
+          return retval;                                                                                \
+      }                                                                                                 \
+      __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {           \
+          CubVector retval;                                                                             \
+          retval.x = x - other.x;                                                                       \
+          return retval;                                                                                \
+      }                                                                                                 \
+    };                                                                                                  \
+                                                                                                        \
+    template<> struct CubVector<base_type, 2> : short_type##2                                           \
+    {                                                                                                   \
+        typedef base_type       BaseType;                                                               \
+        typedef short_type##2   Type;                                                                   \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x + other.x;                                                                     \
+            retval.y = y + other.y;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x - other.x;                                                                     \
+            retval.y = y - other.y;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+    };                                                                                                  \
+                                                                                                        \
+    template<> struct CubVector<base_type, 3> : short_type##3                                           \
+    {                                                                                                   \
+        typedef base_type       BaseType;                                                               \
+        typedef short_type##3   Type;                                                                   \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x + other.x;                                                                     \
+            retval.y = y + other.y;                                                                     \
+            retval.z = z + other.z;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x - other.x;                                                                     \
+            retval.y = y - other.y;                                                                     \
+            retval.z = z - other.z;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+    };                                                                                                  \
+                                                                                                        \
+    template<> struct CubVector<base_type, 4> : short_type##4                                           \
+    {                                                                                                   \
+        typedef base_type       BaseType;                                                               \
+        typedef short_type##4   Type;                                                                   \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x + other.x;                                                                     \
+            retval.y = y + other.y;                                                                     \
+            retval.z = z + other.z;                                                                     \
+            retval.w = w + other.w;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
+            CubVector retval;                                                                           \
+            retval.x = x - other.x;                                                                     \
+            retval.y = y - other.y;                                                                     \
+            retval.z = z - other.z;                                                                     \
+            retval.w = w - other.w;                                                                     \
+            return retval;                                                                              \
+        }                                                                                               \
+    };
+
+
+
+// Expand CUDA vector types for built-in primitives
+CUB_DEFINE_VECTOR_TYPE(char,               char)
+CUB_DEFINE_VECTOR_TYPE(signed char,        char)
+CUB_DEFINE_VECTOR_TYPE(short,              short)
+CUB_DEFINE_VECTOR_TYPE(int,                int)
+CUB_DEFINE_VECTOR_TYPE(long,               long)
+CUB_DEFINE_VECTOR_TYPE(long long,          longlong)
+CUB_DEFINE_VECTOR_TYPE(unsigned char,      uchar)
+CUB_DEFINE_VECTOR_TYPE(unsigned short,     ushort)
+CUB_DEFINE_VECTOR_TYPE(unsigned int,       uint)
+CUB_DEFINE_VECTOR_TYPE(unsigned long,      ulong)
+CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong)
+CUB_DEFINE_VECTOR_TYPE(float,              float)
+CUB_DEFINE_VECTOR_TYPE(double,             double)
+CUB_DEFINE_VECTOR_TYPE(bool,               uchar)
+
+// Undefine macros
+#undef CUB_DEFINE_VECTOR_TYPE
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Wrapper types
+ ******************************************************************************/
+
+/**
+ * \brief Raw storage sized like \p T, so that types with non-trivial constructors can be aliased in unions
+ */
+template <typename T>
+struct Uninitialized
+{
+    /// Word type used for the backing array (the DeviceWord that UnitWord selects for T)
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;
+
+    enum
+    {
+        WORDS = sizeof(T) / sizeof(DeviceWord)      // Number of DeviceWord elements covering one T
+    };
+
+    /// Backing storage (never constructed as a T)
+    DeviceWord storage[WORDS];
+
+    /// View this object's bytes as a T
+    __host__ __device__ __forceinline__ T& Alias()
+    {
+        return *reinterpret_cast<T*>(this);
+    }
+};
+
+
+/**
+ * \brief A key identifier paired with a corresponding value (on 32-bit Windows two extra bool parameters select layout specializations)
+ */
+template <
+    typename    _Key,
+    typename    _Value
+#if defined(_WIN32) && !defined(_WIN64)
+    , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES)
+    , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES)
+#endif // #if defined(_WIN32) && !defined(_WIN64)
+    >
+struct KeyValuePair
+{
+    typedef _Key    Key;                ///< Key data type
+    typedef _Value  Value;              ///< Value data type
+
+    Key     key;                        ///< Item key
+    Value   value;                      ///< Item value
+
+    /// Default constructor (has an empty member-initializer list and an empty body)
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
+
+    /// Constructor copying the given key and value
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
+
+    /// Inequality operator: true if either the values or the keys differ.  NOTE(review): not const-qualified, so the left operand must be non-const
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
+};
+
+#if defined(_WIN32) && !defined(_WIN64)
+
+/**
+ * Win32 won't do 16B alignment.  This can present two problems for
+ * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members:
+ * 1) If a smaller-aligned item were to be listed first, the host compiler places the
+ *    should-be-16B item at too early an offset (and disagrees with device compiler)
+ * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size
+ *    of the struct wrong (and disagrees with device compiler)
+ *
+ * So we put the larger-should-be-aligned item first, and explicitly pad the
+ * end of the struct
+ */
+
+/// Smaller key specialization: the value is stored first, followed by the key and explicit tail padding
+template <typename K, typename V>
+struct KeyValuePair<K, V, true, false>
+{
+    typedef K Key;
+    typedef V Value;
+    // Tail padding: the difference between the value's and the key's alignment
+    typedef char Pad[AlignBytes<V>::ALIGN_BYTES - AlignBytes<K>::ALIGN_BYTES];
+
+    Value   value;  // Value has larger would-be alignment and goes first
+    Key     key;
+    Pad     pad;
+
+    /// Default constructor (has an empty member-initializer list and an empty body)
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
+
+    /// Constructor copying the given key and value (initializers listed in declaration order: value, then key)
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : value(value), key(key) {}
+
+    /// Inequality operator: true if either the values or the keys differ
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
+};
+
+
+/// Smaller value specialization: the key is stored first, followed by the value and explicit tail padding
+template <typename K, typename V>
+struct KeyValuePair<K, V, false, true>
+{
+    typedef K Key;
+    typedef V Value;
+    // Tail padding: the difference between the key's and the value's alignment
+    typedef char Pad[AlignBytes<K>::ALIGN_BYTES - AlignBytes<V>::ALIGN_BYTES];
+
+    Key     key;    // Key has larger would-be alignment and goes first
+    Value   value;
+    Pad     pad;
+
+    /// Default constructor (has an empty member-initializer list and an empty body)
+    __host__ __device__ __forceinline__
+    KeyValuePair() {}
+
+    /// Constructor copying the given key and value
+    __host__ __device__ __forceinline__
+    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
+
+    /// Inequality operator: true if either the values or the keys differ
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
+};
+
+#endif // #if defined(_WIN32) && !defined(_WIN64)
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * \brief A wrapper for passing simple static arrays as kernel parameters (the array is a by-value member of the struct)
+ */
+template <typename T, int COUNT>
+struct ArrayWrapper
+{
+
+    /// Statically-sized array of type \p T
+    T array[COUNT];
+
+    /// Constructor (empty body; it does not assign any array element)
+    __host__ __device__ __forceinline__ ArrayWrapper() {}
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth.
+ *
+ * Many multi-pass computations require a pair of "ping-pong" storage
+ * buffers (e.g., one for reading from and the other for writing to, and then
+ * vice-versa for the subsequent pass).  This structure wraps a set of device
+ * buffers and a "selector" member to track which is "current".
+ */
+template <typename T>
+struct DoubleBuffer
+{
+    /// Pair of device buffer pointers
+    T *d_buffers[2];
+
+    ///  Selector into \p d_buffers (i.e., the active/valid buffer)
+    int selector;
+
+    /// \brief Constructor: both buffer pointers are NULL and buffer 0 is selected
+    __host__ __device__ __forceinline__ DoubleBuffer()
+        : selector(0)
+    {
+        d_buffers[0] = NULL;
+        d_buffers[1] = NULL;
+    }
+
+    /// \brief Constructor: \p d_current becomes buffer 0 (selected) and \p d_alternate becomes buffer 1
+    __host__ __device__ __forceinline__ DoubleBuffer(
+        T *d_current,         ///< The currently valid buffer
+        T *d_alternate)       ///< Alternate storage buffer of the same size as \p d_current
+        : selector(0)
+    {
+        d_buffers[0] = d_current;
+        d_buffers[1] = d_alternate;
+    }
+
+    /// \brief Return pointer to the currently valid buffer (const-qualified so it is usable through a const reference)
+    __host__ __device__ __forceinline__ T* Current() const { return d_buffers[selector]; }
+
+    /// \brief Return pointer to the currently invalid buffer, i.e. the one at index <tt>selector ^ 1</tt>
+    __host__ __device__ __forceinline__ T* Alternate() const { return d_buffers[selector ^ 1]; }
+
+};
+
+
+
+/******************************************************************************
+ * Typedef-detection
+ ******************************************************************************/
+
+
+/**
+ * \brief Defines a structure \p detector_name that is templated on type \p T.  The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name.  Detection compares the size of the return type of the overload chosen for <tt>test<T>(0)</tt>: the <tt>char&</tt> overload is only viable when <tt>T::nested_type_name</tt> names a type.
+ */
+#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name)  \
+    template <typename T>                                               \
+    struct detector_name                                                \
+    {                                                                   \
+        template <typename C>                                           \
+        static char& test(typename C::nested_type_name*);               \
+        template <typename>                                             \
+        static int& test(...);                                          \
+        enum                                                            \
+        {                                                               \
+            VALUE = sizeof(test<T>(0)) < sizeof(int)                    \
+        };                                                              \
+    };
+
+
+
+/******************************************************************************
+ * Simple enable-if (similar to Boost)
+ ******************************************************************************/
+
+/**
+ * \brief Simple enable-if (similar to Boost): the nested \p Type exists only when \p Condition is true
+ */
+template <bool Condition, class T = void>
+struct EnableIf
+{
+    /// Enable-if type for SFINAE dummy variables
+    typedef T Type;
+};
+
+/// Specialization for a false condition: deliberately has no nested \p Type
+template <class T>
+struct EnableIf<false, T> {};
+
+
+
+/******************************************************************************
+ * Functor signature detection
+ ******************************************************************************/
+
+/**
+ * \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T&b)</tt> or <tt>bool operator()(const T& a, const T&b, int idx)</tt>.  Only a third parameter of type <tt>int</tt> is detected; the <tt>unsigned int</tt> detectors are commented out.
+ */
+template <typename T, typename BinaryOp>
+struct BinaryOpHasIdxParam
+{
+private:
+/*
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const>  struct SFINAE1 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)>        struct SFINAE2 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const>                struct SFINAE3 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)>                      struct SFINAE4 {};
+*/
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const>           struct SFINAE5 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)>                 struct SFINAE6 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const>                         struct SFINAE7 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)>                               struct SFINAE8 {};
+/*
+    template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
+*/
+    template <typename BinaryOpT> static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
+    // Fallback overload (returns int) chosen when none of the char-returning overloads above is viable
+    template <typename BinaryOpT> static int Test(...);
+
+public:
+
+    /// Whether the functor BinaryOp has a third <tt>int</tt> index param (true when a char-returning Test overload is selected)
+    static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char);
+};
+
+
+
+
+/******************************************************************************
+ * Simple type traits utilities.
+ *
+ * For example:
+ *     Traits<int>::CATEGORY             // SIGNED_INTEGER
+ *     Traits<NullType>::NULL_TYPE       // true
+ *     Traits<uint4>::CATEGORY           // NOT_A_NUMBER
+ *     Traits<uint4>::PRIMITIVE;         // false
+ *
+ ******************************************************************************/
+
+/**
+ * \brief Basic type traits categories (selected per type by the NumericTraits specializations)
+ */
+enum Category
+{
+    NOT_A_NUMBER,       ///< Default for types without a dedicated NumericTraits specialization (also used for NullType)
+    SIGNED_INTEGER,     ///< Signed integral primitives
+    UNSIGNED_INTEGER,   ///< Unsigned integral primitives (NumericTraits<bool> is also filed here)
+    FLOATING_POINT      ///< float and double
+};
+
+
+/**
+ * \brief Basic type traits (primary template: carries only the category and the PRIMITIVE / NULL_TYPE flags; key constants, twiddling and Max()/Lowest() exist only in the primitive specializations)
+ */
+template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits, typename T>
+struct BaseTraits
+{
+    /// Category
+    static const Category CATEGORY      = _CATEGORY;
+    enum
+    {
+        PRIMITIVE       = _PRIMITIVE,   ///< Forwarded template argument; false for every type that reaches this primary template via NumericTraits
+        NULL_TYPE       = _NULL_TYPE,   ///< Forwarded template argument; true only for the NumericTraits<NullType> specialization
+    };
+};
+
+
+/**
+ * Basic type traits (unsigned primitive specialization): keys are used as-is, so twiddling is the identity
+ */
+template <typename _UnsignedBits, typename T>
+struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits, T>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = UNSIGNED_INTEGER;
+    static const UnsignedBits   LOWEST_KEY  = UnsignedBits(0);      // all bits clear
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1);     // all bits set
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false,
+    };
+
+    // Key transformation applied on the way in: identity for unsigned types
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        return key;
+    }
+    // Inverse of TwiddleIn: also the identity
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        return key;
+    }
+    // Returns the MAX_KEY bit pattern reinterpreted as a T
+    static __host__ __device__ __forceinline__ T Max()
+    {
+        UnsignedBits retval = MAX_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+    // Returns the LOWEST_KEY bit pattern reinterpreted as a T
+    static __host__ __device__ __forceinline__ T Lowest()
+    {
+        UnsignedBits retval = LOWEST_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+};
+
+
+/**
+ * Basic type traits (signed primitive specialization): twiddling flips the top bit of the key
+ */
+template <typename _UnsignedBits, typename T>
+struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits, T>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = SIGNED_INTEGER;
+    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);     // only the most-significant bit set
+    static const UnsignedBits   LOWEST_KEY  = HIGH_BIT;                                                // top bit only (presumably the most-negative two's-complement value)
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;                             // every bit except the top bit
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false,
+    };
+    // Flip the top (sign) bit -- presumably so signed values order correctly when compared as unsigned bit patterns
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        return key ^ HIGH_BIT;
+    };
+    // Inverse of TwiddleIn: XOR with HIGH_BIT undoes itself
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        return key ^ HIGH_BIT;
+    };
+    // Returns the MAX_KEY bit pattern reinterpreted as a T
+    static __host__ __device__ __forceinline__ T Max()
+    {
+        UnsignedBits retval = MAX_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+    // Returns the LOWEST_KEY bit pattern reinterpreted as a T
+    static __host__ __device__ __forceinline__ T Lowest()
+    {
+        UnsignedBits retval = LOWEST_KEY;
+        return reinterpret_cast<T&>(retval);
+    }
+};
+
+template <typename _T>
+struct FpLimits;        // Floating-point limits helper: the primary template is only declared; float and double are specialized below
+
+template <>
+struct FpLimits<float>
+{
+    static __host__ __device__ __forceinline__ float Max() {        // largest finite float
+        return FLT_MAX;
+    }
+
+    static __host__ __device__ __forceinline__ float Lowest() {     // negated FLT_MAX
+        return FLT_MAX * float(-1);
+    }
+};
+
+template <>
+struct FpLimits<double>
+{
+    static __host__ __device__ __forceinline__ double Max() {       // largest finite double
+        return DBL_MAX;
+    }
+
+    static __host__ __device__ __forceinline__ double Lowest() {    // negated DBL_MAX
+        return DBL_MAX  * double(-1);
+    }
+};
+
+
+/**
+ * Basic type traits (fp primitive specialization): twiddling depends on the key's top (sign) bit; Max()/Lowest() come from FpLimits<T>, not from MAX_KEY / LOWEST_KEY
+ */
+template <typename _UnsignedBits, typename T>
+struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits, T>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = FLOATING_POINT;
+    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);     // only the most-significant bit set
+    static const UnsignedBits   LOWEST_KEY  = UnsignedBits(-1);                                        // all bits set
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;                             // every bit except the top bit
+
+    enum
+    {
+        PRIMITIVE       = true,
+        NULL_TYPE       = false,
+    };
+    // Top bit set: flip every bit.  Top bit clear: flip only the top bit.
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT;
+        return key ^ mask;
+    };
+    // Inverse of TwiddleIn: a set top bit means only the top bit was flipped; a clear one means every bit was flipped
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1);
+        return key ^ mask;
+    };
+    // Largest value of T, as reported by FpLimits<T>
+    static __host__ __device__ __forceinline__ T Max() {
+        return FpLimits<T>::Max();
+    }
+    // Lowest value of T, as reported by FpLimits<T>
+    static __host__ __device__ __forceinline__ T Lowest() {
+        return FpLimits<T>::Lowest();
+    }
+};
+
+
+/**
+ * \brief Numeric type traits: maps each listed built-in type to a BaseTraits category and the unsigned integer type used for its bits (_UnsignedBits).  Any type not listed falls back to the primary template (NOT_A_NUMBER, non-primitive).
+ */
+template <typename T> struct NumericTraits :            BaseTraits<NOT_A_NUMBER, false, false, T, T> {};
+
+template <> struct NumericTraits<NullType> :            BaseTraits<NOT_A_NUMBER, false, true, NullType, NullType> {};
+// Plain char is classified by its platform signedness (std::numeric_limits<char>::is_signed)
+template <> struct NumericTraits<char> :                BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {};
+template <> struct NumericTraits<signed char> :         BaseTraits<SIGNED_INTEGER, true, false, unsigned char, signed char> {};
+template <> struct NumericTraits<short> :               BaseTraits<SIGNED_INTEGER, true, false, unsigned short, short> {};
+template <> struct NumericTraits<int> :                 BaseTraits<SIGNED_INTEGER, true, false, unsigned int, int> {};
+template <> struct NumericTraits<long> :                BaseTraits<SIGNED_INTEGER, true, false, unsigned long, long> {};
+template <> struct NumericTraits<long long> :           BaseTraits<SIGNED_INTEGER, true, false, unsigned long long, long long> {};
+
+template <> struct NumericTraits<unsigned char> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char, unsigned char> {};
+template <> struct NumericTraits<unsigned short> :      BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short, unsigned short> {};
+template <> struct NumericTraits<unsigned int> :        BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int, unsigned int> {};
+template <> struct NumericTraits<unsigned long> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long, unsigned long> {};
+template <> struct NumericTraits<unsigned long long> :  BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long, unsigned long long> {};
+
+template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, false, unsigned int, float> {};
+template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, false, unsigned long long, double> {};
+// bool is treated as an unsigned integer whose bit type is UnitWord<bool>::VolatileWord (defined elsewhere in this header)
+template <> struct NumericTraits<bool> :                BaseTraits<UNSIGNED_INTEGER, true, false, typename UnitWord<bool>::VolatileWord, bool> {};
+
+
+
+/**
+ * \brief Type traits: NumericTraits of \p T after RemoveQualifiers<T> has been applied, so qualified forms of a type share the unqualified type's traits
+ */
+template <typename T>
+struct Traits : NumericTraits<typename RemoveQualifiers<T>::Type> {};
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/external/cub/cub/warp/specializations/warp_reduce_shfl.cuh
new file mode 100644
index 00000000000..682a5bfedc2
--- /dev/null
+++ b/external/cub/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -0,0 +1,551 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_type.cuh"
+#include "../../util_macro.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ *
+ * LOGICAL_WARP_THREADS must be a power-of-two
+ */
+template <
+    typename    T,                      ///< Data type being reduced
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
+struct WarpReduceShfl
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// The number of warp reduction steps (log2 of the logical warp width; step i exchanges at offset 1 << i)
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// Number of logical warps in a PTX warp (PTX warp width divided by LOGICAL_WARP_THREADS)
+        LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS,
+    };
+
+    template <typename S>       // Classifies S; the resulting flag selects an Int2Type<> tag for the tagged ReduceStep overloads
+    struct IsInteger
+    {
+        enum {
+            ///Whether the data type is a small (32b or less) unsigned integer for which we can use a single SHFL instruction per exchange
+            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
+        };
+    };
+
+
+    // Creates a mask where the last thread in each logical warp is set (recursive case: instantiated as LastLaneMask<1, LOGICAL_WARPS>, recursing on WARP + 1 until WARP == WARPS)
+    template <int WARP, int WARPS>
+    struct LastLaneMask
+    {
+        enum {
+            BASE_MASK   = 1 << (LOGICAL_WARP_THREADS - 1),      // bit of the last lane of the lowest logical warp
+            MASK        = (LastLaneMask<WARP + 1, WARPS>::MASK << LOGICAL_WARP_THREADS) | BASE_MASK,   // mask of the remaining warps shifted up one logical warp, plus this warp's last-lane bit
+        };
+    };
+
+    // Creates a mask where the last thread in each logical warp is set (terminating case WARP == WARPS: a single last-lane bit)
+    template <int WARP>
+    struct LastLaneMask<WARP, WARP>
+    {
+        enum {
+            MASK        = 1 << (LOGICAL_WARP_THREADS - 1),
+        };
+    };
+
+
+
+    /// Shared memory storage layout type
+    typedef NullType TempStorage;
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+
+    unsigned int lane_id;       ///< Value of LaneId() captured at construction (declared first: the member_mask initializer reads it)
+
+    unsigned int member_mask;   ///< Bit mask with one bit per lane of the calling thread's logical warp; passed to the shuffle and ballot primitives
+
+    //---------------------------------------------------------------------
+    // Construction
+    //---------------------------------------------------------------------
+
+    /// Constructor: captures the caller's lane id and builds the member mask of its logical warp
+    __device__ __forceinline__ WarpReduceShfl(
+        TempStorage &/*temp_storage*/)      // unused: TempStorage is NullType for this SHFL-based variant
+    :
+        lane_id(LaneId()),
+        // LOGICAL_WARP_THREADS low bits set, then shifted up to the first lane of this thread's logical warp
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ?
+            0 : // arch-width subwarps need not be tiled within the arch-warp
+            ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Reduction steps
+    //---------------------------------------------------------------------
+
+    /// Reduction (specialized for summation across uint32 types): one shuffle-down plus a predicated add, fused in inline PTX
+    __device__ __forceinline__ unsigned int ReduceStep(
+        unsigned int    input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*reduction_op*/,   ///< [in] Binary reduction operator
+        int             last_lane,          ///< [in] Index of last lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned int output;
+
+        // Use predicate set from SHFL to guard against invalid peers: the add of the caller's own input (%4) runs only under predicate p
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input), "r"(member_mask));
+#else   // legacy (non-sync) shuffle: same sequence, but member_mask is not passed
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.down.b32 r0|p, %1, %2, %3;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across fp32 types): one shuffle-down plus a predicated add, fused in inline PTX
+    __device__ __forceinline__ float ReduceStep(
+        float           input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*reduction_op*/,   ///< [in] Binary reduction operator
+        int             last_lane,          ///< [in] Index of last lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        float output;
+
+        // Use predicate set from SHFL to guard against invalid peers: the add of the caller's own input (%4) runs only under predicate p
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input), "r"(member_mask));
+#else   // legacy (non-sync) shuffle: same sequence, but member_mask is not passed
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.down.b32 r0|p, %1, %2, %3;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across unsigned long long types): the 64-bit value is shuffled as two 32-bit halves
+    __device__ __forceinline__ unsigned long long ReduceStep(
+        unsigned long long  input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned long long output;
+        // Split input into lo/hi, shuffle each half (both write predicate p), recombine into %0, then add the caller's own input (%1) only under p
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.u64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask));
+#else   // legacy (non-sync) shuffle: same sequence, but member_mask is not passed
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.down.b32 lo|p, lo, %2, %3;"
+            "  shfl.down.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.u64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across long long types): the 64-bit value is shuffled as two 32-bit halves
+    __device__ __forceinline__ long long ReduceStep(
+        long long           input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        long long output;
+
+        // Use predicate set from SHFL to guard against invalid peers: halves are shuffled separately, recombined into %0, and the caller's own input (%1) is added only under p
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.s64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask));
+#else   // legacy (non-sync) shuffle: same sequence, but member_mask is not passed
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.down.b32 lo|p, lo, %2, %3;"
+            "  shfl.down.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 %0, {lo, hi};"
+            "  @p add.s64 %0, %0, %1;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for summation across double types): the 64-bit value is shuffled as two 32-bit halves
+    __device__ __forceinline__ double ReduceStep(
+        double              input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        double output;
+
+        // Use predicate set from SHFL to guard against invalid peers: %0 is first seeded with the caller's own input, the peer value is rebuilt in r0 from its two halves, and r0 is added only under p
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane), "r"(member_mask));
+#else   // legacy (non-sync) shuffle: same sequence, but member_mask is not passed
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.down.b32 lo|p, lo, %2, %3;"
+            "  shfl.down.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane));
+#endif
+
+        return output;
+    }
+
+
+    /// Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types): values are summed only when the peer's key equals the caller's key
+    template <typename ValueT, typename KeyT>
+    __device__ __forceinline__ KeyValuePair<KeyT, ValueT> ReduceStep(
+        KeyValuePair<KeyT, ValueT>                  input,              ///< [in] Calling thread's input item.
+        SwizzleScanOp<ReduceByKeyOp<cub::Sum> >     /*reduction_op*/,       ///< [in] Binary reduction operator
+        int                                         last_lane,          ///< [in] Index of last lane in segment
+        int                                         offset)             ///< [in] Up-offset to pull from
+    {
+        KeyValuePair<KeyT, ValueT> output;
+        // Fetch the key held by the lane "offset" positions above
+        KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask);
+        
+        output.key = input.key;     // the caller's key is always kept
+        output.value = ReduceStep(
+            input.value, 
+            cub::Sum(), 
+            last_lane, 
+            offset, 
+            Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
+        // Keys differ: discard the sum and keep the caller's own value
+        if (input.key != other_key)
+            output.value = input.value;
+
+        return output;
+    }
+
+
+
+    /// Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types): keys are always summed; the value sum is discarded when the caller's own key is positive
+    template <typename ValueT, typename OffsetT>
+    __device__ __forceinline__ KeyValuePair<OffsetT, ValueT> ReduceStep(
+        KeyValuePair<OffsetT, ValueT>                 input,              ///< [in] Calling thread's input item.
+        SwizzleScanOp<ReduceBySegmentOp<cub::Sum> >   /*reduction_op*/,   ///< [in] Binary reduction operator
+        int                                           last_lane,          ///< [in] Index of last lane in segment
+        int                                           offset)             ///< [in] Up-offset to pull from
+    {
+        KeyValuePair<OffsetT, ValueT> output;
+        // Sum value and key independently with the peer "offset" lanes above
+        output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
+        output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type<IsInteger<OffsetT>::IS_SMALL_UNSIGNED>());
+        // A positive own key overrides the summed value with the caller's own value
+        if (input.key > 0)
+            output.value = input.value;
+
+        return output;
+    }
+
+
+    /// Reduction step (generic): shuffle the peer's item down, then apply reduction_op(own, peer) if the peer lane lies within the segment
+    template <typename _T, typename ReductionOp>
+    __device__ __forceinline__ _T ReduceStep(
+        _T                  input,              ///< [in] Calling thread's input item.
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
+        int                 last_lane,          ///< [in] Index of last lane in segment
+        int                 offset)             ///< [in] Up-offset to pull from
+    {
+        _T output = input;
+        // Item held by the lane "offset" positions above
+        _T temp = ShuffleDown(output, offset, last_lane, member_mask);
+
+        // Perform reduction op if valid (the peer lane, lane_id + offset, does not exceed last_lane); otherwise the input is returned unchanged
+        if (offset + lane_id <= last_lane)
+            output = reduction_op(input, temp);
+
+        return output;
+    }
+
+
+    /// Reduction step (tag-dispatched for small unsigned integers size 32b or less): forwards to the four-argument ReduceStep overloads
+    template <typename _T, typename ReductionOp>
+    __device__ __forceinline__ _T ReduceStep(
+        _T              input,                  ///< [in] Calling thread's input item.
+        ReductionOp     reduction_op,           ///< [in] Binary reduction operator
+        int             last_lane,              ///< [in] Index of last lane in segment
+        int             offset,                 ///< [in] Up-offset to pull from
+        Int2Type<true>  /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small unsigned integer
+    {
+        return ReduceStep(input, reduction_op, last_lane, offset);
+    }
+
+
+    /// Reduction step (tag-dispatched for types other than small unsigned integers size 32b or less): forwards to the four-argument ReduceStep overloads
+    template <typename _T, typename ReductionOp>
+    __device__ __forceinline__ _T ReduceStep(
+        _T              input,                  ///< [in] Calling thread's input item.
+        ReductionOp     reduction_op,           ///< [in] Binary reduction operator
+        int             last_lane,              ///< [in] Index of last lane in segment
+        int             offset,                 ///< [in] Up-offset to pull from
+        Int2Type<false> /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small unsigned integer
+    {
+        return ReduceStep(input, reduction_op, last_lane, offset);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Templated reduction iteration
+    //---------------------------------------------------------------------
+
+    template <typename ReductionOp, int STEP>       // Recursive case: performs step STEP (offset 1 << STEP), then recurses with STEP + 1
+    __device__ __forceinline__ void ReduceStep(
+        T&              input,              ///< [in,out] Calling thread's running partial; updated in place.
+        ReductionOp     reduction_op,       ///< [in] Binary reduction operator
+        int             last_lane,          ///< [in] Index of last lane in segment
+        Int2Type<STEP>  /*step*/)           ///< [in] Marker type carrying the current step index
+    {
+        input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
+
+        ReduceStep(input, reduction_op, last_lane, Int2Type<STEP + 1>());
+    }
+
+    template <typename ReductionOp>             // Terminating case: selected when the step index reaches STEPS; does nothing
+    __device__ __forceinline__ void ReduceStep(
+        T&              /*input*/,              ///< [in] Calling thread's input item (unused).
+        ReductionOp     /*reduction_op*/,       ///< [in] Binary reduction operator (unused)
+        int             /*last_lane*/,          ///< [in] Index of last lane in segment (unused)
+        Int2Type<STEPS> /*step*/)               ///< [in] Marker type for the one-past-the-last step
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Reduction operations
+    //---------------------------------------------------------------------
+
+    /// Reduction: runs STEPS shuffle-down steps; returns the calling lane's partial (presumably only the first lane of each logical warp ends up with the full aggregate -- confirm against the WarpReduce interface docs)
+    template <
+        bool            ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
+        int             FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
+        typename        ReductionOp>            ///< Binary reduction operator type
+    __device__ __forceinline__ T Reduce(
+        T               input,                  ///< [in] Calling thread's input
+        int             folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
+        ReductionOp     reduction_op)           ///< [in] Binary reduction operator
+    {
+        // Get the lane of the first and last thread in the logical warp
+        int first_thread   = 0;
+        int last_thread    = LOGICAL_WARP_THREADS - 1;
+        if (!IS_ARCH_WARP)
+        {
+            first_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1));     // clear the low bits: first lane of this logical warp
+            last_thread |= lane_id;                                     // set the low bits: last lane of this logical warp
+        }
+
+        // Zero-based offset, within the logical warp, of the last lane holding valid data.  Common case is FOLDED_ITEMS_PER_LANE = 1 (or a multiple of 32)
+        int lanes_with_valid_data = (folded_items_per_warp - 1) / FOLDED_ITEMS_PER_LANE;
+
+        // Get the last valid lane (clamped to the logical warp when not all lanes are valid)
+        int last_lane = (ALL_LANES_VALID) ?
+            last_thread :
+            CUB_MIN(last_thread, first_thread + lanes_with_valid_data);
+
+        T output = input;
+
+//        // Iterate reduction steps
+//        #pragma unroll
+//        for (int STEP = 0; STEP < STEPS; STEP++)
+//        {
+//            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
+//        }
+
+        // Template-iterate reduction steps (steps 0 .. STEPS-1; output is updated in place through the T& parameter)
+        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
+
+        return output;
+    }
+
+
+    /// Segmented reduction: each lane reduces up to the nearest segment tail at or above it, as derived from the per-lane flags
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,              ///< Flag type
+        typename        ReductionOp>        ///< Binary reduction operator type
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,              ///< [in] Calling thread's input
+        FlagT           flag,               ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op)       ///< [in] Binary reduction operator
+    {
+        // Get the start flags for each thread in the warp (one bit per lane, gathered over member_mask).
+        int warp_flags = WARP_BALLOT(flag, member_mask);
+
+        // Convert to tail-segmented (a head flag at lane i becomes a tail flag at lane i - 1)
+        if (HEAD_SEGMENTED)
+            warp_flags >>= 1;
+
+        // Mask in the last lanes of each logical warp
+        warp_flags |= LastLaneMask<1, LOGICAL_WARPS>::MASK;
+
+        // Mask out the bits below the current thread
+        warp_flags &= LaneMaskGe();
+
+        // Find the next set flag: bit-reverse then count leading zeros yields the index of the lowest remaining set bit
+        int last_lane = __clz(__brev(warp_flags));
+
+        T output = input;
+
+//        // Iterate reduction steps
+//        #pragma unroll
+//        for (int STEP = 0; STEP < STEPS; STEP++)
+//        {
+//            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
+//        }
+
+        // Template-iterate reduction steps (steps 0 .. STEPS-1; output is updated in place through the T& parameter)
+        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
+
+        return output;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/specializations/warp_reduce_smem.cuh b/external/cub/cub/warp/specializations/warp_reduce_smem.cuh
new file mode 100644
index 00000000000..9ba8e94d12d
--- /dev/null
+++ b/external/cub/cub/warp/specializations/warp_reduce_smem.cuh
@@ -0,0 +1,375 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+template <
+    typename    T,                      ///< Data type being reduced
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
+struct WarpReduceSmem
+{
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of warp reduction steps (loop bound / recursion depth of the reduce methods below)
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of threads in half a warp.  NOTE(review): shifts by (STEPS - 1), so presumably LOGICAL_WARP_THREADS > 1 is required -- confirm
+        HALF_WARP_THREADS = 1 << (STEPS - 1),
+
+        /// The number of shared memory elements per warp (presumably padded so reads at lane_id + OFFSET stay in bounds -- confirm)
+        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
+
+        /// FlagT status (when not using ballot); SET and SEEN are distinct bits and are OR-ed together by the smem-based segmented reduce
+        UNSET   = 0x0,  // Is initially unset
+        SET     = 0x1,  // Is initially set
+        SEEN    = 0x2,  // Has seen another head flag from a successor peer
+    };
+
+    /// Shared memory flag type (holds the UNSET / SET / SEEN status bits)
+    typedef unsigned char SmemFlag;
+
+    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
+    struct _TempStorage
+    {
+        T           reduce[WARP_SMEM_ELEMENTS];     // exchange buffer for partial reductions, indexed by lane_id
+        SmemFlag    flags[WARP_SMEM_ELEMENTS];      // exchange buffer for segment flags, indexed by lane_id
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage    &temp_storage;      // aliased view of the caller-provided TempStorage
+    unsigned int    lane_id;            // lane index within the logical warp (computed in the constructor)
+    unsigned int    member_mask;        // mask handed to WARP_SYNC / WARP_BALLOT (computed in the constructor)
+
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor: binds the shared storage and derives the logical lane index and the participation mask
+    __device__ __forceinline__ WarpReduceSmem(
+        TempStorage     &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+
+        // Lane index measured from the start of the logical warp
+        lane_id((!IS_ARCH_WARP) ? (LaneId() % LOGICAL_WARP_THREADS) : LaneId()),
+
+        // LOGICAL_WARP_THREADS low bits; shifted up to this sub-warp only for power-of-two sub-warps,
+        // because arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) <<
+            ((!IS_ARCH_WARP && IS_POW_OF_TWO) ? ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS) : 0))
+    {}
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    //---------------------------------------------------------------------
+    // Regular reduction
+    //---------------------------------------------------------------------
+
+    /**
+     * Reduction step: folds in the partial published OFFSET = 2^STEP lanes above, then recurses into step STEP + 1
+     */
+    template <
+        bool                ALL_LANES_VALID,        ///< True when every lane of each warp contributes a valid fold of items
+        int                 FOLDED_ITEMS_PER_LANE,  ///< How many items were folded into each lane
+        typename            ReductionOp,
+        int                 STEP>
+    __device__ __forceinline__ T ReduceStep(
+        T                   input,                  ///< [in] Calling thread's running partial
+        int                 folded_items_per_warp,  ///< [in] Count of valid items folded into each logical warp
+        ReductionOp         reduction_op,           ///< [in] Reduction operator
+        Int2Type<STEP>      /*step*/)
+    {
+        const int           OFFSET    = 1 << STEP;
+        const unsigned int  peer_slot = lane_id + OFFSET;     // buffer slot of the lane OFFSET positions above
+
+        // Publish this lane's partial, then synchronize the lanes in member_mask before anyone reads
+        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
+        WARP_SYNC(member_mask);
+
+        // Full power-of-two warps skip the range test; otherwise the peer's item offset must be below the valid count
+        const bool peer_in_range = (ALL_LANES_VALID && IS_POW_OF_TWO) || (peer_slot * FOLDED_ITEMS_PER_LANE < folded_items_per_warp);
+        if (peer_in_range)
+        {
+            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[peer_slot]);
+            input = reduction_op(input, peer_addend);
+        }
+
+        WARP_SYNC(member_mask);     // reads of this step complete before the next step stores into the buffer again
+        return ReduceStep<ALL_LANES_VALID, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, reduction_op, Int2Type<STEP + 1>());
+    }
+
+
+    /**
+     * Reduction step (terminate): overload chosen when the step marker equals STEPS; ends the template recursion and returns \p input unchanged
+     */
+    template <
+        bool                ALL_LANES_VALID,            ///< Whether all lanes in each warp are contributing a valid fold of items
+        int                 FOLDED_ITEMS_PER_LANE,      ///< Number of items folded into each lane
+        typename            ReductionOp>
+    __device__ __forceinline__ T ReduceStep(
+        T                   input,                      ///< [in] Calling thread's input (already fully reduced by the preceding steps)
+        int                 /*folded_items_per_warp*/,  ///< [in] Total number of valid items folded into each logical warp (unused here)
+        ReductionOp         /*reduction_op*/,           ///< [in] Reduction operator (unused here)
+        Int2Type<STEPS>     /*step*/)                   ///< [in] Marker type selecting this terminal overload
+    {
+        return input;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Segmented reduction
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Ballot-based segmented reduce.  Every lane folds in the partials of the lanes above it, stopping
+     * short of the next segment head, so that a segment's first lane ends up with the segment's aggregate.
+     */
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,                  ///< [in] Calling thread's input
+        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op,           ///< [in] Reduction operator
+        Int2Type<true>  /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
+    {
+        // Get the start flags for each thread in the warp.  Held unsigned so the shifts below are plain bit moves.
+        unsigned int warp_flags = WARP_BALLOT(flag, member_mask);
+
+        // A tail at lane i is the same boundary as a head at lane i + 1: convert to head-segmented
+        if (!HEAD_SEGMENTED)
+            warp_flags <<= 1;
+
+        // Keep bits above the current thread.
+        warp_flags &= LaneMaskGt();
+
+        // Accommodate packing of multiple logical warps in a single physical warp
+        if (!IS_ARCH_WARP)
+            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
+
+        // Find next flag (index of the lowest set bit; 32 when no flag remains)
+        int next_flag = __clz(__brev(warp_flags));
+
+        // Clip the next segment at the warp boundary if necessary
+        if (LOGICAL_WARP_THREADS != 32)
+            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
+
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            // Share input into buffer
+            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
+
+            WARP_SYNC(member_mask);
+
+            // Update input if peer_addend is in range (peer lies before the next segment boundary)
+            if (OFFSET + lane_id < next_flag)
+            {
+                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
+                input = reduction_op(input, peer_addend);
+            }
+
+            WARP_SYNC(member_mask);
+        }
+
+        return input;
+    }
+
+
+    /**
+     * Smem-based segmented reduce (used when the target has no ballot): lane flags are exchanged through temp_storage.flags
+     */
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,                  ///< [in] Calling thread's input
+        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op,           ///< [in] Reduction operator
+        Int2Type<false> /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
+    {
+        enum
+        {
+            UNSET   = 0x0,  // Is initially unset
+            SET     = 0x1,  // Is initially set
+            SEEN    = 0x2,  // Has seen another head flag from a successor peer
+        };
+
+        // Alias flags onto shared data storage; each lane starts from its own caller-supplied flag
+        volatile SmemFlag *flag_storage = temp_storage.flags;
+        SmemFlag flag_status = (flag) ? SET : UNSET;
+
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            const int OFFSET = 1 << STEP;
+
+            // Share input through buffer
+            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
+
+            WARP_SYNC(member_mask);
+
+            // Get peer from buffer
+            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
+
+            WARP_SYNC(member_mask);
+
+            // Share flag through buffer; the sync below keeps a lane from reading a peer's slot before it is written
+            flag_storage[lane_id] = flag_status;
+
+            WARP_SYNC(member_mask);
+
+            // Get peer flag from buffer
+            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];
+
+            // Update input if peer was in range
+            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
+            {
+                if (HEAD_SEGMENTED)
+                {
+                    // Head-segmented
+                    if ((flag_status & SEEN) == 0)
+                    {
+                        // Has not seen a more distant head flag
+                        if (peer_flag_status & SET)
+                        {
+                            // Has now seen a head flag
+                            flag_status |= SEEN;
+                        }
+                        else
+                        {
+                            // Peer is not a head flag: grab its count
+                            input = reduction_op(input, peer_addend);
+                        }
+
+                        // Update seen status to include that of peer
+                        flag_status |= (peer_flag_status & SEEN);
+                    }
+                }
+                else
+                {
+                    // Tail-segmented.  Simply propagate flag status
+                    if (!flag_status)
+                    {
+                        input = reduction_op(input, peer_addend);
+                        flag_status |= peer_flag_status;
+                    }
+                }
+            }
+        }
+
+        return input;
+    }
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    /**
+     * Reduction: entry point that starts the template-recursive ReduceStep chain at step 0 and returns its result
+     */
+    template <
+        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
+        int                 FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
+        typename            ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,                  ///< [in] Calling thread's input
+        int                 folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
+        ReductionOp         reduction_op)           ///< [in] Reduction operator
+    {
+        return ReduceStep<ALL_LANES_VALID, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, reduction_op, Int2Type<0>());
+    }
+
+
+    /**
+     * Segmented reduction: dispatches to the ballot-based overload when PTX_ARCH >= 200 and to the smem-based overload otherwise
+     */
+    template <
+        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
+        typename        FlagT,
+        typename        ReductionOp>
+    __device__ __forceinline__ T SegmentedReduce(
+        T               input,              ///< [in] Calling thread's input
+        FlagT            flag,               ///< [in] Whether or not the current lane is a segment head/tail
+        ReductionOp     reduction_op)       ///< [in] Reduction operator
+    {
+        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/specializations/warp_scan_shfl.cuh b/external/cub/cub/warp/specializations/warp_scan_shfl.cuh
new file mode 100644
index 00000000000..f0deb8ddefc
--- /dev/null
+++ b/external/cub/cub/warp/specializations/warp_scan_shfl.cuh
@@ -0,0 +1,656 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../util_type.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ *
+ * LOGICAL_WARP_THREADS must be a power-of-two
+ */
+template <
+    typename    T,                      ///< Data type being scanned
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
+struct WarpScanShfl
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// The number of warp scan steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up (OR-ed with first_lane to form the shuffle control word)
+        SHFL_C = ((0xFFFFFFFFU << STEPS) & 31) << 8,
+    };
+
+    template <typename S>
+    struct IntegerTraits
+    {
+        enum {
+            /// Whether the data type is a small (32b or less) unsigned integer for which we can use a single SHFL instruction per exchange
+            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
+        };
+    };
+
+    /// Shared memory storage layout type (empty: this specialization declares no shared-memory fields)
+    struct TempStorage {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    unsigned int lane_id;       // value of LaneId(), captured in the constructor
+
+    unsigned int member_mask;   // participation mask passed to the shuffle operations (computed in the constructor)
+
+    //---------------------------------------------------------------------
+    // Construction
+    //---------------------------------------------------------------------
+
+    /// Constructor: records the lane index and the participation mask of this logical warp (the storage argument is unused)
+    __device__ __forceinline__ WarpScanShfl(
+        TempStorage &/*temp_storage*/)
+    :
+        lane_id(LaneId()),
+
+        // LOGICAL_WARP_THREADS low bits, moved up to this sub-warp's first lane; arch-width warps need no tiling shift
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) <<
+            ((!IS_ARCH_WARP) ? ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS) : 0))
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Inclusive scan steps
+    //---------------------------------------------------------------------
+
+    /// Inclusive prefix scan step (specialized for summation across int32 types); written as inline PTX: one SHFL-up followed by a predicated add
+    __device__ __forceinline__ int InclusiveScanStep(
+        int             input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator (unnamed: the addition is hard-coded in the PTX below)
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        int output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers: r0 = shuffled-up input; this thread's own input (%4) is added only when p is set
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .s32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.s32 r0, r0, %4;"
+            "  mov.s32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .s32 r0;"
+            "  .reg .pred p;"
+            "  shfl.up.b32 r0|p, %1, %2, %3;"
+            "  @p add.s32 r0, r0, %4;"
+            "  mov.s32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
+#endif
+
+        return output;
+    }
+
+    /// Inclusive prefix scan step (specialized for summation across uint32 types); written as inline PTX: one SHFL-up followed by a predicated add
+    __device__ __forceinline__ unsigned int InclusiveScanStep(
+        unsigned int    input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator (unnamed: the addition is hard-coded in the PTX below)
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned int output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers: r0 = shuffled-up input; this thread's own input (%4) is added only when p is set
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 r0;"
+            "  .reg .pred p;"
+            "  shfl.up.b32 r0|p, %1, %2, %3;"
+            "  @p add.u32 r0, r0, %4;"
+            "  mov.u32 %0, r0;"
+            "}"
+            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across fp32 types); written as inline PTX: one SHFL-up followed by a predicated add
+    __device__ __forceinline__ float InclusiveScanStep(
+        float           input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator (unnamed: the addition is hard-coded in the PTX below)
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        float output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers: r0 = shuffled-up input; this thread's own input (%4) is added only when p is set
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .f32 r0;"
+            "  .reg .pred p;"
+            "  shfl.up.b32 r0|p, %1, %2, %3;"
+            "  @p add.f32 r0, r0, %4;"
+            "  mov.f32 %0, r0;"
+            "}"
+            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across unsigned long long types); inline PTX using two 32-bit shuffles per exchange
+    __device__ __forceinline__ unsigned long long InclusiveScanStep(
+        unsigned long long  input,              ///< [in] Calling thread's input item.
+        cub::Sum            /*scan_op*/,        ///< [in] Binary scan operator (unnamed: the addition is hard-coded in the PTX below)
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        unsigned long long output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers.  The input is split into lo/hi words, each shuffled on its own, repacked into r0, then %4 is added when p is set
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
+            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.u64 r0, r0, %4;"
+            "  mov.u64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.up.b32 lo|p, lo, %2, %3;"
+            "  shfl.up.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.u64 r0, r0, %4;"
+            "  mov.u64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across long long types); inline PTX using two 32-bit shuffles per exchange
+    __device__ __forceinline__ long long InclusiveScanStep(
+        long long       input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator (unnamed: the addition is hard-coded in the PTX below)
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        long long output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers.  The input is split into lo/hi words, each shuffled on its own, repacked into r0, then %4 is added when p is set
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .s64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
+            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.s64 r0, r0, %4;"
+            "  mov.s64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .s64 r0;"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.up.b32 lo|p, lo, %2, %3;"
+            "  shfl.up.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.s64 r0, r0, %4;"
+            "  mov.s64 %0, r0;"
+            "}"
+            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
+#endif
+
+        return output;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for summation across fp64 types); inline PTX using two 32-bit shuffles per exchange
+    __device__ __forceinline__ double InclusiveScanStep(
+        double          input,              ///< [in] Calling thread's input item.
+        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator (unnamed: the addition is hard-coded in the PTX below)
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        double output;
+        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
+
+        // Use predicate set from SHFL to guard against invalid peers.  %0 starts as a copy of the input; the peer's value is rebuilt in r0 from two 32-bit shuffles and added to %0 only when p is set
+#ifdef CUB_USE_COOPERATIVE_GROUPS
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %4;"
+            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %4;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
+#else
+        asm volatile(
+            "{"
+            "  .reg .u32 lo;"
+            "  .reg .u32 hi;"
+            "  .reg .pred p;"
+            "  .reg .f64 r0;"
+            "  mov.b64 %0, %1;"
+            "  mov.b64 {lo, hi}, %1;"
+            "  shfl.up.b32 lo|p, lo, %2, %3;"
+            "  shfl.up.b32 hi|p, hi, %2, %3;"
+            "  mov.b64 r0, {lo, hi};"
+            "  @p add.f64 %0, %0, r0;"
+            "}"
+            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c));
+#endif
+
+        return output;
+    }
+
+
+/*
+    /// Inclusive prefix scan (specialized for ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, Value> types)
+    template <typename Value, typename OffsetT>
+    __device__ __forceinline__ KeyValuePair<OffsetT, Value>InclusiveScanStep(
+        KeyValuePair<OffsetT, Value>    input,              ///< [in] Calling thread's input item.
+        ReduceBySegmentOp<cub::Sum>     scan_op,            ///< [in] Binary scan operator
+        int                             first_lane,         ///< [in] Index of first lane in segment
+        int                             offset)             ///< [in] Up-offset to pull from
+    {
+        KeyValuePair<OffsetT, Value> output;
+
+        output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<Value>::IS_SMALL_UNSIGNED>());
+        output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<OffsetT>::IS_SMALL_UNSIGNED>());
+
+        if (input.key > 0)
+            output.value = input.value;
+
+        return output;
+    }
+*/
+
+    /// Inclusive prefix scan step (generic): combines the item obtained through ShuffleUp with the caller's own item
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ _T InclusiveScanStep(
+        _T              input,              ///< [in] Item held by the calling thread
+        ScanOpT          scan_op,            ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of the first lane in the segment
+        int             offset)             ///< [in] Up-offset to pull from
+    {
+        // Fetch the peer's item through the shuffle helper
+        _T peer_item = ShuffleUp(input, offset, first_lane, member_mask);
+
+        // The operator is applied unconditionally; its result is kept only when this lane sits at
+        // least `offset` lanes above first_lane, otherwise the caller's own item is returned as-is
+        _T combined = scan_op(peer_item, input);
+        const bool peer_in_segment = (static_cast<int>(lane_id) >= first_lane + offset);
+        return (peer_in_segment) ? combined : input;
+    }
+
+
+    /// Inclusive prefix scan step (specialized for small integers size 32b or less); forwards to the four-argument overloads, dropping the marker
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ _T InclusiveScanStep(
+        _T              input,              ///< [in] Calling thread's input item.
+        ScanOpT          scan_op,            ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset,             ///< [in] Up-offset to pull from
+        Int2Type<true>  /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small integer
+    {
+        return InclusiveScanStep(input, scan_op, first_lane, offset);
+    }
+
+
+    /// Inclusive prefix scan step (overload selected for types other than small unsigned integers, 32b or less); simply forwards to the generic step
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ _T InclusiveScanStep(
+        _T              input,              ///< [in] Calling thread's input item.
+        ScanOpT          scan_op,            ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        int             offset,             ///< [in] Up-offset to pull from
+        Int2Type<false> /*is_small_unsigned*/)  ///< [in] Marker type selecting this overload (value is unused)
+    {
+        return InclusiveScanStep(input, scan_op, first_lane, offset);
+    }
+
+    //---------------------------------------------------------------------
+    // Templated inclusive scan iteration
+    //---------------------------------------------------------------------
+
+    template <typename _T, typename ScanOp, int STEP>
+    __device__ __forceinline__ void InclusiveScanStep(
+        _T&             input,              ///< [in, out] Calling thread's partial; updated in place by this step (offset 1 << STEP) and by the steps that follow
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        int             first_lane,         ///< [in] Index of first lane in segment
+        Int2Type<STEP>  /*step*/)               ///< [in] Marker type carrying the current scan step as a compile-time constant
+    {
+        input = InclusiveScanStep(input, scan_op, first_lane, 1 << STEP, Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
+
+        InclusiveScanStep(input, scan_op, first_lane, Int2Type<STEP + 1>());
+    }
+
+    template <typename _T, typename ScanOp>
+    __device__ __forceinline__ void InclusiveScanStep(
+        _T&             /*input*/,              ///< [in] Calling thread's partial (left untouched)
+        ScanOp          /*scan_op*/,            ///< [in] Binary scan operator (unused)
+        int             /*first_lane*/,         ///< [in] Index of first lane in segment (unused)
+        Int2Type<STEPS> /*step*/)               ///< [in] Marker type for step == STEPS; this empty overload terminates the template recursion
+    {}
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    //---------------------------------------------------------------------
+    // Broadcast
+    //---------------------------------------------------------------------
+
+    /// Broadcast: returns the \p input held by lane \p src_lane, fetched with ShuffleIndex across the logical warp
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] The value to broadcast
+        int             src_lane)           ///< [in] Which warp lane is to do the broadcasting
+    {
+        return ShuffleIndex(input, src_lane, LOGICAL_WARP_THREADS, member_mask);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive operations
+    //---------------------------------------------------------------------
+
+    /// Inclusive scan over the whole logical warp (a single segment starting at lane 0)
+    template <typename _T, typename ScanOpT>
+    __device__ __forceinline__ void InclusiveScan(
+        _T              input,              ///< [in] Calling thread's input item.
+        _T              &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOpT         scan_op)            ///< [in] Binary scan operator
+    {
+        inclusive_output = input;
+
+        // Unsegmented scan: every lane's segment begins at lane 0
+        int segment_first_lane = 0;
+
+        // Template-recursive formulation of the steps, left disabled in favor of the unrolled loop below
+//        InclusiveScanStep(inclusive_output, scan_op, segment_first_lane, Int2Type<0>());
+
+        // Iterate scan steps with up-offsets 1, 2, 4, ... (STEPS iterations)
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            inclusive_output = InclusiveScanStep(
+                inclusive_output,
+                scan_op,
+                segment_first_lane,
+                (1 << STEP),
+                Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
+        }
+
+    }
+
+    /// Inclusive scan, specialized for reduce-value-by-key: only the values are scanned, restarting wherever the key differs from the preceding lane's key
+    template <typename KeyT, typename ValueT, typename ReductionOpT>
+    __device__ __forceinline__ void InclusiveScan(
+        KeyValuePair<KeyT, ValueT>      input,              ///< [in] Calling thread's input item.
+        KeyValuePair<KeyT, ValueT>      &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ReduceByKeyOp<ReductionOpT >    scan_op)            ///< [in] Binary scan operator (only its wrapped \p op is applied, to the values)
+    {
+        inclusive_output = input;
+
+        KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask);
+
+        unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask);
+
+        // Mask away all lanes greater than ours
+        ballot = ballot & LaneMaskLe();
+
+        // Find index of the highest remaining set bit, i.e. the nearest segment head at or below our lane (clamped to 0 when no bit is set)
+        int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));
+
+        // Template-recursive formulation of the steps, left disabled in favor of the unrolled loop below
+//        InclusiveScanStep(inclusive_output.value, scan_op.op, segment_first_lane, Int2Type<0>());
+
+        // Iterate scan steps with up-offsets 1, 2, 4, ... (STEPS iterations), never pulling from below segment_first_lane
+        #pragma unroll
+        for (int STEP = 0; STEP < STEPS; STEP++)
+        {
+            inclusive_output.value = InclusiveScanStep(
+                inclusive_output.value,
+                scan_op.op,
+                segment_first_lane,
+                (1 << STEP),
+                Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
+        }
+    }
+
+
+    /// Inclusive scan with aggregate: runs the plain inclusive scan, then hands every lane the last lane's result
+    template <typename ScanOpT>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOpT         scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InclusiveScan(input, inclusive_output, scan_op);
+
+        // Grab aggregate from last warp lane (lane LOGICAL_WARP_THREADS - 1 holds the inclusive total)
+        warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Get exclusive from inclusive
+    //---------------------------------------------------------------------
+
+    /// Compute exclusive from inclusive (general case): each lane takes the inclusive result of the lane before it
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update(
+        T                       /*input*/,          ///< [in] Calling thread's input item (unused)
+        T                       &inclusive,         ///< [in, out] Calling thread's inclusive result (read only here)
+        T                       &exclusive,         ///< [out] Calling thread's exclusive result
+        ScanOpT                 /*scan_op*/,        ///< [in] Binary scan operator (unused)
+        IsIntegerT              /*is_integer*/)     ///< [in] Marker type used only for overload selection
+    {
+        // initial value unknown; NOTE(review): lane 0 has no preceding lane, so its exclusive is presumably unspecified -- confirm against ShuffleUp
+        exclusive = ShuffleUp(inclusive, 1, 0, member_mask);
+    }
+
+    /// Compute exclusive from inclusive (specialized for summation of integer types): no communication needed, just subtract
+    __device__ __forceinline__ void Update(
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                /*scan_op*/,
+        Int2Type<true>          /*is_integer*/)
+    {
+        // initial value presumed 0, so removing this lane's own input from its inclusive sum yields the exclusive sum
+        exclusive = inclusive - input;
+    }
+
+    /// Seed inclusive with \p initial_value, then derive exclusive from the preceding lane's seeded inclusive (lane 0 of each logical warp gets \p initial_value)
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        exclusive = ShuffleUp(inclusive, 1, 0, member_mask);
+
+        unsigned int segment_id = (IS_ARCH_WARP) ?
+            lane_id :
+            lane_id % LOGICAL_WARP_THREADS;
+
+        if (segment_id == 0)
+            exclusive = initial_value;
+    }
+
+    /// Seed inclusive with \p initial_value, then derive exclusive by subtracting this lane's input (specialized for summation of integer types)
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                scan_op,
+        T                       initial_value,
+        Int2Type<true>          /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        exclusive = inclusive - input;
+    }
+
+
+    /// Fetch the warp aggregate from the last lane's inclusive result, then compute exclusive via the matching two-output Update overload
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 scan_op,
+        IsIntegerT              is_integer)
+    {
+        warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask);
+        Update(input, inclusive, exclusive, scan_op, is_integer);
+    }
+
+    /// Fetch the warp aggregate from the last lane's inclusive result (before \p initial_value is applied), then seed inclusive and compute exclusive via the matching Update overload
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              is_integer)
+    {
+        warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask);
+        Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
+    }
+
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/specializations/warp_scan_smem.cuh b/external/cub/cub/warp/specializations/warp_scan_smem.cuh
new file mode 100644
index 00000000000..c3a7a94ba26
--- /dev/null
+++ b/external/cub/cub/warp/specializations/warp_scan_smem.cuh
@@ -0,0 +1,397 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+template <
+    typename    T,                      ///< Data type being scanned
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
+struct WarpScanSmem
+{
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of warp scan steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of threads in half a warp
+        HALF_WARP_THREADS = 1 << (STEPS - 1),
+
+        /// The number of shared memory elements per warp
+        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
+    };
+
+    /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars)
+    typedef typename If<((Equals<T, char>::VALUE || Equals<T, signed char>::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT;
+
+    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp: half a warp of padding below one cell per lane)
+    typedef CellT _TempStorage[WARP_SMEM_ELEMENTS];
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage    &temp_storage;
+    unsigned int    lane_id;
+    unsigned int    member_mask;
+
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor
+    __device__ __forceinline__ WarpScanSmem(
+        TempStorage     &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS),
+
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
+            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
+            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
+    {}
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Basic inclusive scan iteration (template unrolled, inductive-case specialization)
+    template <
+        bool        HAS_IDENTITY,
+        int         STEP,
+        typename    ScanOp>
+    __device__ __forceinline__ void ScanStep(
+        T                       &partial,
+        ScanOp                  scan_op,
+        Int2Type<STEP>          /*step*/)
+    {
+        const int OFFSET = 1 << STEP;
+
+        // Share partial into buffer
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial);
+
+        WARP_SYNC(member_mask);
+
+        // Update partial if addend is in range (with HAS_IDENTITY, lanes below OFFSET read the padding cells instead of being skipped)
+        if (HAS_IDENTITY || (lane_id >= OFFSET))
+        {
+            T addend = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]);
+            partial = scan_op(addend, partial);
+        }
+        WARP_SYNC(member_mask);
+
+        ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>());
+    }
+
+
+    /// Basic inclusive scan iteration(template unrolled, base-case specialization)
+    template <
+        bool        HAS_IDENTITY,
+        typename    ScanOp>
+    __device__ __forceinline__ void ScanStep(
+        T                       &/*partial*/,
+        ScanOp                  /*scan_op*/,
+        Int2Type<STEPS>         /*step*/)
+    {}
+
+
+    /// Inclusive prefix scan (specialized for summation across primitive types)
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,              ///< [in] Calling thread's input item.
+        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        Sum                     scan_op,            ///< [in] Binary scan operator
+        Int2Type<true>          /*is_primitive*/)   ///< [in] Marker type indicating whether T is primitive type
+    {
+        T identity = 0;
+        ThreadStore<STORE_VOLATILE>(&temp_storage[lane_id], (CellT) identity);
+
+        WARP_SYNC(member_mask);
+
+        // Iterate scan steps
+        output = input;
+        ScanStep<true>(output, scan_op, Int2Type<0>());
+    }
+
+
+    /// Inclusive prefix scan
+    template <typename ScanOp, int IS_PRIMITIVE>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,              ///< [in] Calling thread's input item.
+        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp                  scan_op,            ///< [in] Binary scan operator
+        Int2Type<IS_PRIMITIVE>  /*is_primitive*/)   ///< [in] Marker type indicating whether T is primitive type
+    {
+        // Iterate scan steps
+        output = input;
+        ScanStep<false>(output, scan_op, Int2Type<0>());
+    }
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    //---------------------------------------------------------------------
+    // Broadcast
+    //---------------------------------------------------------------------
+
+    /// Broadcast
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] The value to broadcast
+        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
+    {
+        if (lane_id == src_lane)
+        {
+            ThreadStore<STORE_VOLATILE>(temp_storage, (CellT) input);
+        }
+
+        WARP_SYNC(member_mask);
+
+        return (T)ThreadLoad<LOAD_VOLATILE>(temp_storage);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive operations
+    //---------------------------------------------------------------------
+
+    /// Inclusive scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InclusiveScan(input, inclusive_output, scan_op, Int2Type<Traits<T>::PRIMITIVE>());
+    }
+
+
+    /// Inclusive scan with aggregate
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InclusiveScan(input, inclusive_output, scan_op);
+
+        // Retrieve aggregate
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output);
+
+        WARP_SYNC(member_mask);
+
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+
+        WARP_SYNC(member_mask);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Get exclusive from inclusive
+    //---------------------------------------------------------------------
+
+    /// Update inclusive and exclusive using input and inclusive
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update(
+        T                       /*input*/,      ///< [in]
+        T                       &inclusive,     ///< [in, out]
+        T                       &exclusive,     ///< [out]
+        ScanOpT                 /*scan_op*/,    ///< [in]
+        IsIntegerT              /*is_integer*/) ///< [in]
+    {
+        // initial value unknown (lane 0 ends up reading the padding cell just below the first lane's slot)
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+    }
+
+    /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update(
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                /*scan_op*/,
+        Int2Type<true>          /*is_integer*/)
+    {
+        // initial value presumed 0
+        exclusive = inclusive - input;
+    }
+
+    /// Update inclusive and exclusive using initial value using input, inclusive, and initial value
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+        if (lane_id == 0)
+            exclusive = initial_value;
+    }
+
+    /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        cub::Sum                scan_op,
+        T                       initial_value,
+        Int2Type<true>          /*is_integer*/)
+    {
+        inclusive = scan_op(initial_value, inclusive);
+        exclusive = inclusive - input;
+    }
+
+
+    /// Update inclusive, exclusive, and warp aggregate using input and inclusive
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 /*scan_op*/,
+        IsIntegerT              /*is_integer*/)
+    {
+        // Initial value presumed to be unknown or identity (either way our padding is correct)
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+    }
+
+    /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types)
+    __device__ __forceinline__ void Update (
+        T                       input,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        cub::Sum                /*scan_op*/,
+        Int2Type<true>          /*is_integer*/)
+    {
+        // Initial value presumed to be unknown or identity (either way our padding is correct)
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+        exclusive = inclusive - input;
+    }
+
+    /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value
+    template <typename ScanOpT, typename IsIntegerT>
+    __device__ __forceinline__ void Update (
+        T                       /*input*/,
+        T                       &inclusive,
+        T                       &exclusive,
+        T                       &warp_aggregate,
+        ScanOpT                 scan_op,
+        T                       initial_value,
+        IsIntegerT              /*is_integer*/)
+    {
+        // Broadcast warp aggregate
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+
+        WARP_SYNC(member_mask);
+
+        // Update inclusive with initial value
+        inclusive = scan_op(initial_value, inclusive);
+
+        // Get exclusive from inclusive: publish in our own slot and read our predecessor's, so lane 0 never indexes below temp_storage[HALF_WARP_THREADS - 1]
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+        WARP_SYNC(member_mask);
+
+        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+
+        if (lane_id == 0)
+            exclusive = initial_value;
+    }
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/warp_reduce.cuh b/external/cub/cub/warp/warp_reduce.cuh
new file mode 100644
index 00000000000..ef78dd6a009
--- /dev/null
+++ b/external/cub/cub/warp/warp_reduce.cuh
@@ -0,0 +1,612 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "specializations/warp_reduce_shfl.cuh"
+#include "specializations/warp_reduce_smem.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup WarpModule
+ * @{
+ */
+
+/**
+ * \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png)
+ *
+ * \tparam T                        The reduction input/output element type
+ * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
+ * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ *   uses a binary combining operator to compute a single aggregate from a list of input elements.
+ * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
+ * - The number of entrant threads must be a multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Performance Considerations
+ * - Uses special instructions when applicable (e.g., warp \p SHFL instructions)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *     - Summation (<b><em>vs.</em></b> generic reduction)
+ *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Simple Examples
+ * \warpcollective{WarpReduce}
+ * \par
+ * The code snippet below illustrates four concurrent warp sum reductions within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpReduce for type int
+ *     typedef cub::WarpReduce<int> WarpReduce;
+ *
+ *     // Allocate WarpReduce shared memory for 4 warps
+ *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+ *
+ *     // Obtain one input item per thread
+ *     int thread_data = ...
+ *
+ *     // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
+ *     int warp_id = threadIdx.x / 32;
+ *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+ * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
+ * \p 2544, and \p 3568, respectively (and is undefined in other threads).
+ *
+ * \par
+ * The code snippet below illustrates a single warp sum reduction within a block of
+ * 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpReduce for type int
+ *     typedef cub::WarpReduce<int> WarpReduce;
+ *
+ *     // Allocate WarpReduce shared memory for one warp
+ *     __shared__ typename WarpReduce::TempStorage temp_storage;
+ *     ...
+ *
+ *     // Only the first warp performs a reduction
+ *     if (threadIdx.x < 32)
+ *     {
+ *         // Obtain one input item per thread
+ *         int thread_data = ...
+ *
+ *         // Return the warp-wide sum to lane0
+ *         int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the warp of threads is <tt>{0, 1, 2, 3, ..., 31}</tt>.
+ * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
+ *
+ */
+template <
+    typename    T,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpReduce
+{
+private:
+
+    /******************************************************************************
+     * Compile-time constants and backend selection
+     ******************************************************************************/
+
+    enum
+    {
+        /// True when LOGICAL_WARP_THREADS equals CUB_WARP_THREADS(PTX_ARCH)
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// True when LOGICAL_WARP_THREADS is a power of two
+        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+    };
+
+public:
+
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Backend selection.  WarpReduceShfl is chosen when (PTX_ARCH >= 300) and (LOGICAL_WARP_THREADS is a power-of-two); WarpReduceSmem is chosen otherwise
+    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
+        WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
+        WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpReduce;
+
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+private:
+
+    /// Temporary storage layout required by the selected backend
+    typedef typename InternalWarpReduce::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Per-thread state
+     ******************************************************************************/
+
+    /// Reference to the temporary storage handed to the constructor
+    _TempStorage &temp_storage;
+
+
+    /******************************************************************************
+     * Public storage type
+     ******************************************************************************/
+
+public:
+
+    /// \smemstorage{WarpReduce}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Collective constructor that binds this instance to the given temporary storage allocation.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ WarpReduce(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias())
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Adds up the inputs of all lanes in the calling warp.  Only warp <em>lane</em><sub>0</sub> receives a valid result.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp sum reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for 4 warps
+     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int warp_id = threadIdx.x / 32;
+     *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
+     * \p 2544, and \p 3568, respectively (and is undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input)              ///< [in] Calling thread's input
+    {
+        return Reduce(input, cub::Sum());       // Full-warp generic reduction with the cub::Sum functor
+    }
+
+    /**
+     * \brief Adds up the inputs of the first \p valid_items lanes in the calling warp.  Only warp <em>lane</em><sub>0</sub> receives a valid result.
+     *
+     * Every thread of the calling warp must pass the same \p valid_items.  Otherwise the result is undefined.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide sum to lane0
+     *     int aggregate = WarpReduce(temp_storage).Sum(
+     *         thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...}</tt> and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 6 (and is
+     * undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input,              ///< [in] Calling thread's input
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        // Partially-full generic reduction with the cub::Sum functor
+        return Reduce(input, cub::Sum(), valid_items);
+    }
+
+
+    /**
+     * \brief Computes per-segment sums in the calling warp, with segments delimited by head-flags.  Each segment's sum is returned to the first lane of that segment (<em>lane</em><sub>0</sub> always starts a segment).
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a head-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return each segment's sum to the first lane of that segment
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
+     *         thread_data, head_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31}</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0}</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam FlagT           <b>[inferred]</b> Type of the segment head flag
+     *
+     */
+    template <typename FlagT>
+    __device__ __forceinline__ T HeadSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               head_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+    {
+        return HeadSegmentedReduce(input, head_flag, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes per-segment sums in the calling warp, with segments delimited by tail-flags.  Each segment's sum is returned to the first lane of that segment (<em>lane</em><sub>0</sub> always starts a segment).
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a tail-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return each segment's sum to the first lane of that segment
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
+     *         thread_data, tail_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31}</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1}</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam FlagT           <b>[inferred]</b> Type of the segment tail flag
+     */
+    template <typename FlagT>
+    __device__ __forceinline__ T TailSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               tail_flag)          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
+    {
+        return TailSegmentedReduce(input, tail_flag, cub::Sum());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Generic reductions
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Reduces the inputs of all lanes in the calling warp with the supplied binary functor.  Only warp <em>lane</em><sub>0</sub> receives a valid result.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp max reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for 4 warps
+     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int warp_id = threadIdx.x / 32;
+     *     int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(
+     *         thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 31, \p 63,
+     * \p 95, and \p 127, respectively  (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        InternalWarpReduce backend(temp_storage);
+        return backend.template Reduce<true, 1>(input, LOGICAL_WARP_THREADS, reduction_op);
+    }
+
+    /**
+     * \brief Reduces the inputs of the first \p valid_items lanes in the calling warp with the supplied binary functor.  Only warp <em>lane</em><sub>0</sub> receives a valid result.
+     *
+     * Every thread of the calling warp must pass the same \p valid_items.  Otherwise the result is undefined.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide reduction to lane0
+     *     int aggregate = WarpReduce(temp_storage).Reduce(
+     *         thread_data, cub::Max(), valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...}</tt> and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 3 (and is
+     * undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        InternalWarpReduce backend(temp_storage);
+        return backend.template Reduce<false, 1>(input, valid_items, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes per-segment reductions in the calling warp, with segments delimited by head-flags.  Each segment's reduction is returned to the first lane of that segment (<em>lane</em><sub>0</sub> always starts a segment).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a head-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return each segment's reduction to the first lane of that segment
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
+     *         thread_data, head_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31}</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0}</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam FlagT           <b>[inferred]</b> Type of the segment head flag
+     */
+    template <typename ReductionOp, typename FlagT>
+    __device__ __forceinline__ T HeadSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               head_flag,          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        InternalWarpReduce backend(temp_storage);
+        return backend.template SegmentedReduce<true>(input, head_flag, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes per-segment reductions in the calling warp, with segments delimited by tail-flags.  Each segment's reduction is returned to the first lane of that segment (<em>lane</em><sub>0</sub> always starts a segment).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a tail-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return each segment's reduction to the first lane of that segment
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
+     *         thread_data, tail_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is <tt>{0, 1, 2, 3, ..., 31}</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1}</tt>,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     * \tparam FlagT           <b>[inferred]</b> Type of the segment tail flag
+     */
+    template <typename ReductionOp, typename FlagT>
+    __device__ __forceinline__ T TailSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               tail_flag,          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        InternalWarpReduce backend(temp_storage);
+        return backend.template SegmentedReduce<false>(input, tail_flag, reduction_op);
+    }
+
+
+
+    //@}  end member group
+};
+
+/** @} */       // end group WarpModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/warp_scan.cuh b/external/cub/cub/warp/warp_scan.cuh
new file mode 100644
index 00000000000..3f78ca8a090
--- /dev/null
+++ b/external/cub/cub/warp/warp_scan.cuh
@@ -0,0 +1,936 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "specializations/warp_scan_shfl.cuh"
+#include "specializations/warp_scan_smem.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup WarpModule
+ * @{
+ */
+
+/**
+ * \brief The WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.  ![](warp_scan_logo.png)
+ *
+ * \tparam T                        The scan input/output element type
+ * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20).
+ * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
+ *
+ * \par Overview
+ * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
+ *   produces an output list where each element is computed to be the reduction
+ *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
+ *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
+ *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
+ *   the <em>i</em><sup>th</sup> output reduction.
+ * - Supports non-commutative scan operators
+ * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads)
+ * - The number of entrant threads must be a multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Performance Considerations
+ * - Uses special instructions when applicable (e.g., warp \p SHFL)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *     - Summation (<b><em>vs.</em></b> generic scan)
+ *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Simple Examples
+ * \warpcollective{WarpScan}
+ * \par
+ * The code snippet below illustrates four concurrent warp prefix sums within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpScan for type int
+ *     typedef cub::WarpScan<int> WarpScan;
+ *
+ *     // Allocate WarpScan shared memory for 4 warps
+ *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ *     // Obtain one input item per thread
+ *     int thread_data = ...
+ *
+ *     // Compute warp-wide prefix sums
+ *     int warp_id = threadIdx.x / 32;
+ *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+ * The corresponding output \p thread_data in each of the four warps of threads will be
+ * <tt>{0, 1, 2, 3, ..., 31}</tt>.
+ *
+ * \par
+ * The code snippet below illustrates a single warp prefix sum within a block of
+ * 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpScan for type int
+ *     typedef cub::WarpScan<int> WarpScan;
+ *
+ *     // Allocate WarpScan shared memory for one warp
+ *     __shared__ typename WarpScan::TempStorage temp_storage;
+ *     ...
+ *
+ *     // Only the first warp performs a prefix sum
+ *     if (threadIdx.x < 32)
+ *     {
+ *         // Obtain one input item per thread
+ *         int thread_data = ...
+ *
+ *         // Compute warp-wide prefix sums
+ *         WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *     }
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the warp of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+ * The corresponding output \p thread_data will be <tt>{0, 1, 2, 3, ..., 31}</tt>.
+ *
+ */
+template <
+    typename    T,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpScan
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size equals the warp size of the targeted PTX architecture (when true, the constructor uses LaneId() without the modulo)
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two (n & (n - 1) == 0); one of the two conditions for choosing the SHFL-based specialization below
+        IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
+
+        /// Whether the data type is a signed or unsigned integer (which has fully-associative addition); passed to InternalWarpScan::Update() as an Int2Type tag
+        IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
+    };
+
+    /// Internal specialization.  Use SHFL-based scan (WarpScanShfl) if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two); otherwise use WarpScanSmem
+    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
+        WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
+        WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan;
+
+    /// Shared memory storage layout type for WarpScan (the TempStorage type declared by the selected internal specialization)
+    typedef typename InternalWarpScan::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference (bound in the constructor from TempStorage::Alias())
+    _TempStorage    &temp_storage;
+    unsigned int    lane_id;        ///< LaneId() of the calling thread, taken modulo LOGICAL_WARP_THREADS unless IS_ARCH_WARP (see constructor)
+
+
+
+    /******************************************************************************
+     * Public types
+     ******************************************************************************/
+
+public:
+
+    /// \smemstorage{WarpScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};    // Wrapper around _TempStorage; the constructor unwraps it via Alias()
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
+     */
+    __device__ __forceinline__ WarpScan(
+        TempStorage &temp_storage)             ///< [in] Caller-provided memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        // LaneId() is used as-is when the logical warp equals the architecture
+        // warp; otherwise it is reduced modulo LOGICAL_WARP_THREADS.
+        lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix sums
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive prefix sum across the calling warp.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix sums
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>{1, 2, 3, ..., 32}</tt>.
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output)  ///< [out] Calling thread's output item (inclusive prefix sum).  May be aliased with \p input.
+    {
+        InclusiveScan(input, inclusive_output, cub::Sum());     // Prefix sum expressed as an inclusive scan with the cub::Sum functor
+    }
+
+
+    /**
+     * \brief Computes an inclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix sums
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>{1, 2, 3, ..., 32}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item (inclusive prefix sum).  May be aliased with \p input.
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate);     // Same as the two-argument overload, additionally forwarding warp_aggregate
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix sums
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.
+     *
+     * \par
+     *  - \identityzero
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix sums
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>{0, 1, 2, ..., 31}</tt>.
+     *
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,              ///< [in] Item contributed by the calling thread.
+        T               &exclusive_output)  ///< [out] Exclusive prefix sum for the calling thread.  May be aliased with \p input.
+    {
+        T zero_seed = 0;    // the sum is seeded with 0
+        ExclusiveScan(input, exclusive_output, zero_seed, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     *  - \identityzero
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix sums
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * <tt>{0, 1, 2, ..., 31}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,              ///< [in] Item contributed by the calling thread.
+        T               &exclusive_output,  ///< [out] Exclusive prefix sum for the calling thread.  May be aliased with \p input.
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        T zero_seed = 0;    // the sum is seeded with 0
+        ExclusiveScan(input, exclusive_output, zero_seed, cub::Sum(), warp_aggregate);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item (inclusive scan result).  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op);     // Forward to the selected internal specialization
+    }
+
+
+    /**
+     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveScan(
+     *         thread_data, thread_data, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's output item (inclusive scan result).  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate);     // Forward to the selected internal specialization
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p exclusive_output computed for <em>warp-lane</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread.
+        T               &exclusive_output,  ///< [out] Exclusive scan result for the calling thread (undefined in lane 0).  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        // Compile-time tag telling Update() whether T is an integer type
+        typedef Int2Type<IS_INTEGER> IsIntegerTag;
+
+        // Step 1: plain inclusive scan into a scratch value
+        InternalWarpScan warp_scan(temp_storage);
+        T inclusive_partial;
+        warp_scan.InclusiveScan(input, inclusive_partial, scan_op);
+
+        // Step 2: let the internal implementation derive the exclusive result
+        warp_scan.Update(
+            input, inclusive_partial, exclusive_output, scan_op, IsIntegerTag());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>INT_MIN, 32, 32, 34, ..., 60, 62</tt>, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread.
+        T               &exclusive_output,  ///< [out] Exclusive scan result for the calling thread.  May be aliased with \p input.
+        T               initial_value,      ///< [in] Seed value for the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        // Compile-time tag telling Update() whether T is an integer type
+        typedef Int2Type<IS_INTEGER> IsIntegerTag;
+
+        // Step 1: plain inclusive scan into a scratch value
+        InternalWarpScan warp_scan(temp_storage);
+        T inclusive_partial;
+        warp_scan.InclusiveScan(input, inclusive_partial, scan_op);
+
+        // Step 2: let the internal implementation derive the exclusive
+        // result, handing it the caller's initial_value
+        warp_scan.Update(
+            input, inclusive_partial, exclusive_output, scan_op, initial_value, IsIntegerTag());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p exclusive_output computed for <em>warp-lane</em><sub>0</sub> is undefined.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)  Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread.
+        T               &exclusive_output,  ///< [out] Exclusive scan result for the calling thread (undefined in lane 0).  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        // Compile-time tag telling Update() whether T is an integer type
+        typedef Int2Type<IS_INTEGER> IsIntegerTag;
+
+        // Step 1: plain inclusive scan into a scratch value
+        InternalWarpScan warp_scan(temp_storage);
+        T inclusive_partial;
+        warp_scan.InclusiveScan(input, inclusive_partial, scan_op);
+
+        // Step 2: let the internal implementation derive the exclusive
+        // result and fill in warp_aggregate
+        warp_scan.Update(
+            input, inclusive_partial, exclusive_output, warp_aggregate, scan_op, IsIntegerTag());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p thread_data in the first warp would be
+     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>INT_MIN, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Item contributed by the calling thread.
+        T               &exclusive_output,  ///< [out] Exclusive scan result for the calling thread.  May be aliased with \p input.
+        T               initial_value,      ///< [in] Seed value for the exclusive scan
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        // Compile-time tag telling Update() whether T is an integer type
+        typedef Int2Type<IS_INTEGER> IsIntegerTag;
+
+        // Step 1: plain inclusive scan into a scratch value
+        InternalWarpScan warp_scan(temp_storage);
+        T inclusive_partial;
+        warp_scan.InclusiveScan(input, inclusive_partial, scan_op);
+
+        // Step 2: let the internal implementation derive the exclusive
+        // result (handing it initial_value) and fill in warp_aggregate
+        warp_scan.Update(
+            input, inclusive_partial, exclusive_output, warp_aggregate,
+            scan_op, initial_value, IsIntegerTag());
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Combination (inclusive & exclusive) prefix scans
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p exclusive_output computed for <em>warp-lane</em><sub>0</sub> is undefined.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide combined inclusive and exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *     // Compute both inclusive and exclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     int inclusive_partial, exclusive_partial;
+     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p inclusive_partial in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     * The corresponding output \p exclusive_partial in the first warp would be
+     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
+     * (The output \p exclusive_partial in warp lane<sub>0</sub> is undefined.)
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void Scan(
+        T               input,              ///< [in] Item contributed by the calling thread.
+        T               &inclusive_output,  ///< [out] Inclusive-scan result for the calling thread.
+        T               &exclusive_output,  ///< [out] Exclusive-scan result for the calling thread (undefined for <em>warp-lane</em><sub>0</sub>).
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        // Internal scan implementation constructed over this instance's temp_storage
+        InternalWarpScan warp_scan_impl(temp_storage);
+
+        // The inclusive result is written straight into the caller's output
+        warp_scan_impl.InclusiveScan(input, inclusive_output, scan_op);
+
+        // Update() then derives the unseeded exclusive result from it
+        warp_scan_impl.Update(
+            input, inclusive_output, exclusive_output,
+            scan_op, Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute both inclusive and exclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     int inclusive_partial, exclusive_partial;
+     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
+     * The corresponding output \p inclusive_partial in the first warp would be
+     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
+     * The corresponding output \p exclusive_partial in the first warp would be
+     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>INT_MIN, 32, 32, 34, ..., 60, 62</tt>, etc.
+     *
+     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void Scan(
+        T               input,              ///< [in] Item contributed by the calling thread.
+        T               &inclusive_output,  ///< [out] Inclusive-scan result for the calling thread.
+        T               &exclusive_output,  ///< [out] Exclusive-scan result for the calling thread.
+        T               initial_value,      ///< [in] Seed value for the exclusive scan
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        // Internal scan implementation constructed over this instance's temp_storage
+        InternalWarpScan warp_scan_impl(temp_storage);
+
+        // The inclusive result is written straight into the caller's output
+        warp_scan_impl.InclusiveScan(input, inclusive_output, scan_op);
+
+        // Update() then derives the exclusive result, seeded with initial_value
+        warp_scan_impl.Update(
+            input, inclusive_output, exclusive_output,
+            scan_op, initial_value,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data exchange
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Broadcast the value \p input from <em>warp-lane</em><sub><tt>src_lane</tt></sub> to all lanes in the warp
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the warp-wide broadcasts of values from
+     * lanes<sub>0</sub> in each of four warps to all other threads in those warps.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Broadcast from lane0 in each warp to all other threads in the warp
+     *     int warp_id = threadIdx.x / 32;
+     *     thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
+     * The corresponding output \p thread_data will be
+     * <tt>{0, 0, ..., 0}</tt> in warp<sub>0</sub>,
+     * <tt>{32, 32, ..., 32}</tt> in warp<sub>1</sub>,
+     * <tt>{64, 64, ..., 64}</tt> in warp<sub>2</sub>, etc.
+     */
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] The value to broadcast (the copy held by lane \p src_lane is the one distributed)
+        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
+    {
+        return InternalWarpScan(temp_storage).Broadcast(input, src_lane);   // forwards to the internal implementation built on temp_storage
+    }
+
+    //@}  end member group
+
+};
+
+/** @} */       // end group WarpModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/eclipse code style profile.xml b/external/cub/eclipse code style profile.xml
new file mode 100644
index 00000000000..3ca7f771cc2
--- /dev/null
+++ b/external/cub/eclipse code style profile.xml	
@@ -0,0 +1,155 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<profiles version="1">
+<profile kind="CodeFormatterProfile" name="B40C" version="1">
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_for" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_in_empty_block" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.lineSplit" value="80"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_base_types" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.keep_else_statement_on_same_line" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_switch" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_if" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_exception_specification" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_base_types" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_access_specifier" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_exception_specification" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_arguments" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_block" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.use_tabs_only_for_leading_indentations" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_labeled_statement" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_case" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_enum_declarations" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_expressions_in_array_initializer" value="16"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_declarator_list" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_bracket" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_for" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_prefix_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.tabulation.size" value="4"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_else_in_if_statement" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_enumerator_list" value="48"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_declarator_list" value="16"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_switch" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_empty_lines" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_cases" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.keep_empty_array_initializer_on_one_line" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_method_declaration" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.put_empty_statement_on_new_line" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_switch" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_cast" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_braces_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_method_declaration" value="next_line"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_while" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_question_in_conditional" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_semicolon" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_arguments" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_base_clause" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_breaks_compare_to_cases" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_unary_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_declarator_list" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_arguments_in_method_invocation" value="16"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_while" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_brackets" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_bracket" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_parameters_in_method_declaration" value="48"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.number_of_empty_lines_to_preserve" value="1"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_semicolon_in_for" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_conditional" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_block" value="next_line"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_type_declaration" value="next_line"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_assignment_operator" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_arguments" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_expression_list" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_parameters" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.continuation_indentation" value="1"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_expression_list" value="0"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_parameters" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_default" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_binary_operator" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_conditional_expression" value="48"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_if" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.format_guardian_clause_on_one_line" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_cast" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_access_specifier_compare_to_type_header" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_type_declaration" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.continuation_indentation_for_array_initializer" value="1"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_labeled_statement" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_semicolon_in_for" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_namespace_header" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_brace_in_block" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_assignment_operator" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_compact_if" value="0"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_array_initializer" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_at_end_of_file_if_missing" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_parameters" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_expression_list" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_question_in_conditional" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_exception_specification" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_binary_operator" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_identifier_in_function_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_base_clause_in_type_declaration" value="48"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_throws" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_exception_specification" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_declaration_compare_to_template_header" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_unary_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_switch" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_statements_compare_to_body" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_throws" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indent_statements_compare_to_block" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_arguments" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_catch_in_try_statement" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.alignment_for_throws_clause_in_method_declaration" value="48"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_invocation" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_paren_in_cast" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_catch" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_parameters" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.tabulation.char" value="space"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_parameters" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_while" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_block_in_case" value="end_of_line"/>
+<setting id="org.eclipse.cdt.core.formatter.compact_else_if" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_postfix_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_base_clause" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_after_template_declaration" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_catch" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.keep_then_statement_on_same_line" value="false"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_switch" value="end_of_line"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_if" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_switch" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.keep_imple_if_on_one_line" value="true"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.indentation.size" value="4"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_namespace_declaration" value="end_of_line"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_conditional" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_enum_declarations" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_prefix_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_arguments" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.brace_position_for_array_initializer" value="next_line"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_case" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_catch" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_namespace_declaration" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_postfix_operator" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_bracket" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_while_in_do_statement" value="do not insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_for" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_parameters" value="insert"/>
+<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_arguments" value="do not insert"/>
+</profile>
+</profiles>
diff --git a/external/cub/examples/block/Makefile b/external/cub/examples/block/Makefile
new file mode 100644
index 00000000000..753931b3407
--- /dev/null
+++ b/external/cub/examples/block/Makefile
@@ -0,0 +1,128 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+#-------------------------------------------------------------------------------
+#
+# Makefile usage
+#
+# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>]
+#
+#-------------------------------------------------------------------------------
+ 
+include ../../common.mk 
+ 
+ 
+#-------------------------------------------------------------------------------
+# Includes
+#-------------------------------------------------------------------------------
+
+INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
+
+
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+DEPS =				$(CUB_DEPS) \
+					$(CUB_DIR)test/Makefile \
+					$(CUB_DIR)test/test_util.h \
+					$(CUB_DIR)test/mersenne.h \
+		
+ALL = 	example_block_radix_sort \
+	 	example_block_reduce \
+	 	example_block_scan
+		
+
+
+#-------------------------------------------------------------------------------
+# make default: target with no prerequisites and no recipe (builds nothing)
+#-------------------------------------------------------------------------------
+
+default:
+
+
+#-------------------------------------------------------------------------------
+# make clean: remove bin/ files matching $(CPU_ARCH_SUFFIX) and compiler intermediates in this directory
+#-------------------------------------------------------------------------------
+
+clean :
+	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
+
+
+#-------------------------------------------------------------------------------
+# make all: build every example named in $(ALL)
+#-------------------------------------------------------------------------------
+
+all : $(ALL)
+
+#-------------------------------------------------------------------------------
+# make run: execute each example in $(ALL) from bin/ in turn, stopping at the first failure (has no prerequisites, so nothing is built first)
+#-------------------------------------------------------------------------------
+
+run : 
+	for i in $(ALL); do ./bin/$${i}_$(BIN_SUFFIX) --device=$(device) || exit 1; done
+
+
+
+
+#-------------------------------------------------------------------------------
+# make example_block_reduce: compile example_block_reduce.cu into bin/example_block_reduce_$(BIN_SUFFIX)
+#-------------------------------------------------------------------------------
+
+example_block_reduce: bin/example_block_reduce_$(BIN_SUFFIX)
+
+bin/example_block_reduce_$(BIN_SUFFIX) : example_block_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_reduce_$(BIN_SUFFIX) example_block_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_block_scan: compile example_block_scan.cu into bin/example_block_scan_$(BIN_SUFFIX)
+#-------------------------------------------------------------------------------
+
+example_block_scan: bin/example_block_scan_$(BIN_SUFFIX)
+
+bin/example_block_scan_$(BIN_SUFFIX) : example_block_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_scan_$(BIN_SUFFIX) example_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_block_radix_sort: compile example_block_radix_sort.cu into bin/example_block_radix_sort_$(BIN_SUFFIX)
+#-------------------------------------------------------------------------------
+
+example_block_radix_sort: bin/example_block_radix_sort_$(BIN_SUFFIX)
+
+bin/example_block_radix_sort_$(BIN_SUFFIX) : example_block_radix_sort.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_radix_sort_$(BIN_SUFFIX) example_block_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+	
diff --git a/external/cub/examples/block/example_block_radix_sort.cu b/external/cub/examples/block/example_block_radix_sort.cu
new file mode 100644
index 00000000000..0bceb831ccf
--- /dev/null
+++ b/external/cub/examples/block/example_block_radix_sort.cu
@@ -0,0 +1,323 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple demonstration of cub::BlockRadixSort
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_block_radix_sort.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console (define before including cub.h)
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <iostream>
+#include <algorithm>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/block/block_radix_sort.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+/// Verbose output (set from the --v flag in main; enables the input dump and is forwarded to CompareDeviceResults)
+bool g_verbose = false;
+
+/// Number of timed kernel launches averaged by Test (overridable with --i)
+int g_timing_iterations = 100;
+
+/// Number of thread blocks per launch; each block sorts its own tile (overridable with --grid-size)
+int g_grid_size = 1;
+
+/// Uniform key samples: when set (--uniform flag, assigned in main), Initialize fills every key with zero instead of random bits
+bool g_uniform_keys;
+
+
+//---------------------------------------------------------------------
+// Kernels
+//---------------------------------------------------------------------
+
+/**
+ * Each thread block loads one tile of keys, radix-sorts it and writes it back; thread 0 records the sort's cycle count
+ */
+template <
+    typename    Key,
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD>
+__launch_bounds__ (BLOCK_THREADS)
+__global__ void BlockSortKernel(
+    Key         *d_in,          // Input keys, one tile per thread block
+    Key         *d_out,         // Sorted output keys, one tile per thread block
+    clock_t     *d_elapsed)     // Per-block elapsed cycle count of the block sort
+{
+    // Number of keys handled by one thread block
+    const int tile_items = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Tile loader (warp-striped loads for coalescing, transposed in shared memory to a blocked arrangement)
+    typedef BlockLoad<Key, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE> TileLoadT;
+    // Block-wide radix sorter for the same tile shape
+    typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD> TileSortT;
+
+    // Loader and sorter run one after the other, so their scratch space shares one union
+    __shared__ union SharedScratch
+    {
+        typename TileLoadT::TempStorage     load_scratch;
+        typename TileSortT::TempStorage     sort_scratch;
+    } smem;
+
+    // Keys owned by this thread
+    Key thread_keys[ITEMS_PER_THREAD];
+
+    // Index of the first key of this block's tile
+    const int tile_base = blockIdx.x * tile_items;
+
+    // Read the tile into a blocked arrangement
+    TileLoadT(smem.load_scratch).Load(d_in + tile_base, thread_keys);
+
+    // The shared union is about to be reused by the sorter
+    __syncthreads();
+
+    // Cycle counter just before the sort
+    const clock_t t_begin = clock();
+
+    // Sort the tile; keys come back in a striped arrangement
+    TileSortT(smem.sort_scratch).SortBlockedToStriped(thread_keys);
+
+    // Cycle counter just after the sort
+    const clock_t t_end = clock();
+
+    // Write the striped keys straight to the output tile
+    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + tile_base, thread_keys);
+
+    // One thread per block publishes the absolute cycle difference
+    if (threadIdx.x == 0)
+    {
+        d_elapsed[blockIdx.x] = (t_end < t_begin) ? t_begin - t_end : t_end - t_begin;
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Host utilities
+//---------------------------------------------------------------------
+
+
+/**
+ * Fill h_in with keys (all zero when g_uniform_keys is set, random bits otherwise),
+ * mirror them into h_reference, then sort only the leading tile_size reference keys.
+ */
+template <typename Key>
+void Initialize(
+    Key *h_in,
+    Key *h_reference,
+    int num_items,
+    int tile_size)
+{
+    int idx = 0;
+    while (idx < num_items)
+    {
+        Key &key = h_in[idx];
+        if (!g_uniform_keys)
+            RandomBits(key);
+        else
+            key = 0;
+
+        h_reference[idx++] = key;
+    }
+
+    // The reference solution covers the first tile only
+    std::sort(h_reference, h_reference + tile_size);
+}
+
+
+/**
+ * Sort g_grid_size tiles with BlockSortKernel, verify the first tile against the host reference, then time repeated launches
+ */
+template <
+    typename    Key,
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD>
+void Test()
+{
+    const int       TILE_SIZE   = BLOCK_THREADS * ITEMS_PER_THREAD;
+    const int       num_keys    = TILE_SIZE * g_grid_size;
+    const size_t    key_bytes   = sizeof(Key) * TILE_SIZE * g_grid_size;
+
+    // Host buffers: input keys, reference output, per-block cycle counts
+    Key     *host_keys      = new Key[num_keys];
+    Key     *host_sorted    = new Key[num_keys];
+    clock_t *host_cycles    = new clock_t[g_grid_size];
+
+    // Generate the keys; only the first tile of the reference gets sorted
+    Initialize(host_keys, host_sorted, num_keys, TILE_SIZE);
+
+    // Device buffers
+    Key     *dev_keys       = NULL;
+    Key     *dev_sorted     = NULL;
+    clock_t *dev_cycles     = NULL;
+    CubDebugExit(cudaMalloc((void**)&dev_keys,      key_bytes));
+    CubDebugExit(cudaMalloc((void**)&dev_sorted,    key_bytes));
+    CubDebugExit(cudaMalloc((void**)&dev_cycles,    sizeof(clock_t) * g_grid_size));
+
+    // Dump the first tile of input when verbose
+    if (g_verbose)
+    {
+        printf("Input data: ");
+        for (int k = 0; k < TILE_SIZE; ++k)
+            std::cout << host_keys[k] << ", ";
+        printf("\n\n");
+    }
+
+    // Occupancy of this kernel specialization (reported in the banner below)
+    int max_sm_occupancy;
+    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD>, BLOCK_THREADS));
+
+    // Upload the keys
+    CubDebugExit(cudaMemcpy(dev_keys, host_keys, key_bytes, cudaMemcpyHostToDevice));
+
+    printf("BlockRadixSort %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
+        num_keys, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
+    fflush(stdout);
+
+    // Warm-up launch; its output is the one that gets verified
+    BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
+        dev_keys,
+        dev_sorted,
+        dev_cycles);
+
+    // Surface launch-time and execution-time errors before looking at the output
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Verify the first tile only (the reference is sorted for that tile alone)
+    printf("\tOutput items: ");
+    int compare = CompareDeviceResults(host_sorted, dev_sorted, TILE_SIZE, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+    fflush(stdout);
+
+    // Accumulators for the timed launches
+    GpuTimer            timer;
+    float               total_millis            = 0.0f;
+    unsigned long long  total_cycles            = 0;
+
+    for (int iter = 0; iter < g_timing_iterations; ++iter)
+    {
+        // Timed launch
+        timer.Start();
+        BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
+            dev_keys,
+            dev_sorted,
+            dev_cycles);
+        timer.Stop();
+        total_millis += timer.ElapsedMillis();
+
+        // Fetch and accumulate the per-block cycle counts of this launch
+        CubDebugExit(cudaMemcpy(host_cycles, dev_cycles, sizeof(clock_t) * g_grid_size, cudaMemcpyDeviceToHost));
+        for (int block = 0; block < g_grid_size; ++block)
+            total_cycles += host_cycles[block];
+    }
+
+    // Make sure all device work has finished cleanly
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Averages over launches (and over blocks for the cycle counts)
+    float avg_millis            = total_millis / g_timing_iterations;
+    float avg_items_per_sec     = float(num_keys) / avg_millis / 1000.0f;
+    double avg_clocks           = double(total_cycles) / g_timing_iterations / g_grid_size;
+    double avg_clocks_per_item  = avg_clocks / TILE_SIZE;
+
+    printf("\tAverage BlockRadixSort::SortBlocked clocks: %.3f\n", avg_clocks);
+    printf("\tAverage BlockRadixSort::SortBlocked clocks per item: %.3f\n", avg_clocks_per_item);
+    printf("\tAverage kernel millis: %.4f\n", avg_millis);
+    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
+    fflush(stdout);
+
+    // Release host and device buffers
+    delete[] host_keys;
+    delete[] host_sorted;
+    delete[] host_cycles;
+    if (dev_keys) CubDebugExit(cudaFree(dev_keys));
+    if (dev_sorted) CubDebugExit(cudaFree(dev_sorted));
+    if (dev_cycles) CubDebugExit(cudaFree(dev_cycles));
+}
+
+
+/**
+ * Main: parses --v, --uniform, --i and --grid-size, initializes the device, then runs the sort test for uint32, fp32 and uint8 keys
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_uniform_keys = args.CheckCmdLineFlag("uniform");
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("grid-size", g_grid_size);
+
+    // Print usage (every flag parsed above is listed; options are separated by a space)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--i=<timing iterations (default:%d)>] "
+            "[--grid-size=<grid size (default:%d)>] "
+            "[--uniform] [--v] "
+            "\n", argv[0], g_timing_iterations, g_grid_size);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    fflush(stdout);
+
+    // Run tests (128 threads x 13 keys per thread for each key type)
+    printf("\nuint32:\n"); fflush(stdout);
+    Test<unsigned int, 128, 13>();
+    printf("\n"); fflush(stdout);
+
+    printf("\nfp32:\n"); fflush(stdout);
+    Test<float, 128, 13>();
+    printf("\n"); fflush(stdout);
+
+    printf("\nuint8:\n"); fflush(stdout);
+    Test<unsigned char, 128, 13>();
+    printf("\n"); fflush(stdout);
+
+    return 0;
+}
+
diff --git a/external/cub/examples/block/example_block_reduce.cu b/external/cub/examples/block/example_block_reduce.cu
new file mode 100644
index 00000000000..8e30ef23296
--- /dev/null
+++ b/external/cub/examples/block/example_block_reduce.cu
@@ -0,0 +1,290 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple demonstration of cub::BlockReduce
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_block_reduce.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console (define before including cub.h)
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <iostream>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/block/block_reduce.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+/// Verbose output (set from the --v flag in main; enables the input dump and is forwarded to CompareDeviceResults)
+bool g_verbose = false;
+
+/// Number of timed kernel launches averaged by Test (overridable with --i)
+int g_timing_iterations = 100;
+
+/// Number of thread blocks per launch (overridable with --grid-size)
+int g_grid_size = 1;
+
+
+
+//---------------------------------------------------------------------
+// Kernels
+//---------------------------------------------------------------------
+
+/**
+ * Block-wide integer sum: the block reduces the tile at d_in; thread 0 stores the sum and the reduction's cycle count
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    BlockReduceAlgorithm    ALGORITHM>
+__global__ void BlockSumKernel(
+    int         *d_in,          // Tile of input
+    int         *d_out,         // Tile aggregate (a single int)
+    clock_t     *d_elapsed)     // Elapsed cycle count of block reduction (a single clock_t)
+{
+    // Reducer specialized for this block size and algorithm
+    typedef BlockReduce<int, BLOCK_THREADS, ALGORITHM> TileReduceT;
+
+    // Scratch space required by the reducer
+    __shared__ typename TileReduceT::TempStorage reduce_scratch;
+
+    // Items owned by this thread, read in a striped arrangement
+    int thread_items[ITEMS_PER_THREAD];
+    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, thread_items);
+
+    // Cycle counter just before the reduction
+    const clock_t t_begin = clock();
+
+    // Reduce all items of the block to one sum
+    const int block_sum = TileReduceT(reduce_scratch).Sum(thread_items);
+
+    // Cycle counter just after the reduction
+    const clock_t t_end = clock();
+
+    // Thread 0 publishes the absolute cycle difference and the sum
+    if (threadIdx.x == 0)
+    {
+        d_elapsed[0] = (t_end < t_begin) ? t_begin - t_end : t_end - t_begin;
+        d_out[0] = block_sum;
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Host utilities
+//---------------------------------------------------------------------
+
+/**
+ * Fill h_in[0..num_items) with the repeating pattern 0, 1, ..., 16 (element i is i % 17).
+ * Returns the sum of all generated elements, i.e. the aggregate the kernel is expected to produce.
+ */
+int Initialize(int *h_in, int num_items)
+{
+    int total = 0;
+    int idx = 0;
+    while (idx < num_items)
+    {
+        const int value = idx % 17;
+        h_in[idx++] = value;
+        total += value;
+    }
+    return total;
+}
+
+
+/**
+ * Test thread block reduction: checks the device aggregate against the host sum, then times repeated launches
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    BlockReduceAlgorithm    ALGORITHM>
+void Test()
+{
+    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Allocate host input (the result is compared directly from device memory, so no host output buffer is needed)
+    int *h_in           = new int[TILE_SIZE];
+
+    // Initialize problem and reference output on host
+    int h_aggregate = Initialize(h_in, TILE_SIZE);
+
+    // Initialize device arrays (every CUDA call is checked so a failure stops the run instead of producing bogus numbers)
+    int *d_in           = NULL;
+    int *d_out          = NULL;
+    clock_t *d_elapsed  = NULL;
+    CubDebugExit(cudaMalloc((void**)&d_in,          sizeof(int) * TILE_SIZE));
+    CubDebugExit(cudaMalloc((void**)&d_out,         sizeof(int) * 1));
+    CubDebugExit(cudaMalloc((void**)&d_elapsed,     sizeof(clock_t)));
+
+    // Display input problem data
+    if (g_verbose)
+    {
+        printf("Input data: ");
+        for (int i = 0; i < TILE_SIZE; i++)
+            printf("%d, ", h_in[i]);
+        printf("\n\n");
+    }
+
+    // Kernel props
+    int max_sm_occupancy;
+    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
+
+    // Copy problem to device
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice));
+
+    printf("BlockReduce algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
+        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : "BLOCK_REDUCE_WARP_REDUCTIONS",
+        TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
+
+    // Run the reduction kernel once and surface launch/execution errors before validating its result
+    BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
+        d_in,
+        d_out,
+        d_elapsed);
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Check total aggregate
+    printf("\tAggregate: ");
+    int compare = CompareDeviceResults(&h_aggregate, d_out, 1, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Run this several times and average the performance results
+    GpuTimer    timer;
+    float       elapsed_millis          = 0.0f;
+    clock_t     elapsed_clocks          = 0;
+
+    for (int i = 0; i < g_timing_iterations; ++i)
+    {
+        // Copy problem to device
+        CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice));
+
+        timer.Start();
+
+        // Run the reduction kernel (timed)
+        BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
+            d_in,
+            d_out,
+            d_elapsed);
+
+        timer.Stop();
+        elapsed_millis += timer.ElapsedMillis();
+
+        // Copy clocks from device
+        clock_t clocks;
+        CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
+        elapsed_clocks += clocks;
+
+    }
+
+    // Check for kernel errors and STDIO from the kernel, if any
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Display timing results
+    float avg_millis            = elapsed_millis / g_timing_iterations;
+    float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
+    float avg_clocks            = float(elapsed_clocks) / g_timing_iterations;
+    float avg_clocks_per_item   = avg_clocks / TILE_SIZE;
+
+    printf("\tAverage BlockReduce::Sum clocks: %.3f\n", avg_clocks);
+    printf("\tAverage BlockReduce::Sum clocks per item: %.3f\n", avg_clocks_per_item);
+    printf("\tAverage kernel millis: %.4f\n", avg_millis);
+    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (d_in) CubDebugExit(cudaFree(d_in));
+    if (d_out) CubDebugExit(cudaFree(d_out));
+    if (d_elapsed) CubDebugExit(cudaFree(d_elapsed));
+}
+
+
+/**
+ * Entry point: parses --v, --i and --grid-size, initializes the device, then runs every block-size/algorithm combination
+ */
+int main(int argc, char** argv)
+{
+    // Read the command-line switches into the globals
+    CommandLineArgs cmdline(argc, argv);
+    g_verbose = cmdline.CheckCmdLineFlag("v");
+    cmdline.GetCmdLineArgument("i", g_timing_iterations);
+    cmdline.GetCmdLineArgument("grid-size", g_grid_size);
+
+    // --help short-circuits everything else
+    if (cmdline.CheckCmdLineFlag("help"))
+    {
+        // Usage summary, then quit before touching the device
+        printf(
+            "%s [--device=<device-id>] "
+            "[--i=<timing iterations>] "
+            "[--grid-size=<grid size>] [--v] \n",
+            argv[0]);
+        exit(0);
+    }
+
+    // Select and initialize the device (honours --device)
+    CubDebugExit(cmdline.DeviceInit());
+
+    // Raking reduction: 1024-item tiles, from 1024 threads x 1 item down to 16 threads x 64 items
+    Test<1024, 1, BLOCK_REDUCE_RAKING>();
+    Test<512, 2, BLOCK_REDUCE_RAKING>();
+    Test<256, 4, BLOCK_REDUCE_RAKING>();
+    Test<128, 8, BLOCK_REDUCE_RAKING>();
+    Test<64, 16, BLOCK_REDUCE_RAKING>();
+    Test<32, 32, BLOCK_REDUCE_RAKING>();
+    Test<16, 64, BLOCK_REDUCE_RAKING>();
+
+    printf("-------------\n");
+
+    Test<1024, 1, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<512, 2, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<256, 4, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<128, 8, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<64, 16, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<32, 32, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    Test<16, 64, BLOCK_REDUCE_WARP_REDUCTIONS>();
+
+    return 0;
+}
+
diff --git a/external/cub/examples/block/example_block_scan.cu b/external/cub/examples/block/example_block_scan.cu
new file mode 100644
index 00000000000..74729f8e95b
--- /dev/null
+++ b/external/cub/examples/block/example_block_scan.cu
@@ -0,0 +1,334 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple demonstration of cub::BlockScan
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_block_scan.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console (define before including cub.h)
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <iostream>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/block/block_scan.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+/// Verbose output (set from the --v flag in main; enables the input dump and is forwarded to CompareDeviceResults)
+bool g_verbose = false;
+
+/// Number of timed kernel launches averaged by Test (overridable with --i)
+int g_timing_iterations = 100;
+
+/// Number of thread blocks per launch (overridable with --grid-size)
+int g_grid_size = 1;
+
+
+
+//---------------------------------------------------------------------
+// Kernels
+//---------------------------------------------------------------------
+
+/**
+ * Block-wide exclusive prefix sum over one tile of integers; thread 0 also stores the tile aggregate just past the tile
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    BlockScanAlgorithm      ALGORITHM>
+__global__ void BlockPrefixSumKernel(
+    int         *d_in,          // Tile of input
+    int         *d_out,         // Tile of output, followed by one extra slot for the aggregate
+    clock_t     *d_elapsed)     // Elapsed cycle count of block scan (a single clock_t)
+{
+    // Tile loader (warp-striped loads for coalescing, transposed in shared memory to a blocked arrangement)
+    typedef BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE> TileLoadT;
+
+    // Tile writer (warp-transpose policy, the store-side counterpart of the loader above)
+    typedef BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_WARP_TRANSPOSE> TileStoreT;
+
+    // Block-wide scanner using the requested algorithm
+    typedef BlockScan<int, BLOCK_THREADS, ALGORITHM> TileScanT;
+
+    // The three phases run one after the other, so their scratch space shares one union
+    __shared__ union SharedScratch
+    {
+        typename TileLoadT::TempStorage     load_scratch;
+        typename TileStoreT::TempStorage    store_scratch;
+        typename TileScanT::TempStorage     scan_scratch;
+    } smem;
+
+    // Items owned by this thread
+    int thread_items[ITEMS_PER_THREAD];
+
+    // Read the tile into a blocked arrangement
+    TileLoadT(smem.load_scratch).Load(d_in, thread_items);
+
+    // The shared union is about to be reused by the scanner
+    __syncthreads();
+
+    // Cycle counter just before the scan
+    const clock_t t_begin = clock();
+
+    // In-place exclusive prefix sum; the tile total comes back in block_aggregate
+    int block_aggregate;
+    TileScanT(smem.scan_scratch).ExclusiveSum(thread_items, thread_items, block_aggregate);
+
+    // Cycle counter just after the scan
+    const clock_t t_end = clock();
+
+    // The shared union is about to be reused by the writer
+    __syncthreads();
+
+    // Write the scanned tile from its blocked arrangement
+    TileStoreT(smem.store_scratch).Store(d_out, thread_items);
+
+    // Thread 0 publishes the absolute cycle difference and the aggregate
+    if (threadIdx.x == 0)
+    {
+        d_elapsed[0] = (t_end < t_begin) ? t_begin - t_end : t_end - t_begin;
+        d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = block_aggregate;
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Host utilities
+//---------------------------------------------------------------------
+
+/**
+ * Fill h_in[0..num_items) with the pattern i % 17 and h_reference with its exclusive prefix sum.
+ * Returns the sum of all inputs, i.e. the aggregate the scan is expected to report.
+ */
+int Initialize(
+    int *h_in,
+    int *h_reference,
+    int num_items)
+{
+    int running_sum = 0;
+    int idx = 0;
+    while (idx < num_items)
+    {
+        h_in[idx] = idx % 17;
+
+        // Exclusive scan: the reference is recorded before the current input is added
+        h_reference[idx] = running_sum;
+        running_sum += h_in[idx];
+        ++idx;
+    }
+    return running_sum;
+}
+
+
+/**
+ * Test thread block scan: checks the scanned tile and its aggregate against the host reference, then times repeated launches
+ */
+template <
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockScanAlgorithm  ALGORITHM>
+void Test()
+{
+    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Allocate host arrays (results are compared directly from device memory, so no host output buffer is needed)
+    int *h_in           = new int[TILE_SIZE];
+    int *h_reference    = new int[TILE_SIZE];
+
+    // Initialize problem and reference output on host
+    int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);
+
+    // Initialize device arrays (d_out has one extra slot for the aggregate; every CUDA call is checked)
+    int *d_in           = NULL;
+    int *d_out          = NULL;
+    clock_t *d_elapsed  = NULL;
+    CubDebugExit(cudaMalloc((void**)&d_in,          sizeof(int) * TILE_SIZE));
+    CubDebugExit(cudaMalloc((void**)&d_out,         sizeof(int) * (TILE_SIZE + 1)));
+    CubDebugExit(cudaMalloc((void**)&d_elapsed,     sizeof(clock_t)));
+
+    // Display input problem data
+    if (g_verbose)
+    {
+        printf("Input data: ");
+        for (int i = 0; i < TILE_SIZE; i++)
+            printf("%d, ", h_in[i]);
+        printf("\n\n");
+    }
+
+    // Kernel props
+    int max_sm_occupancy;
+    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
+
+    // Copy problem to device
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice));
+
+    printf("BlockScan algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
+        (ALGORITHM == BLOCK_SCAN_RAKING) ? "BLOCK_SCAN_RAKING" : (ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE) ? "BLOCK_SCAN_RAKING_MEMOIZE" : "BLOCK_SCAN_WARP_SCANS",
+        TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
+
+    // Run the scan kernel once and surface launch/execution errors before validating its results
+    BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
+        d_in,
+        d_out,
+        d_elapsed);
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Check results
+    printf("\tOutput items: ");
+    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Check total aggregate (stored by the kernel just past the tile)
+    printf("\tAggregate: ");
+    compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Run this several times and average the performance results
+    GpuTimer    timer;
+    float       elapsed_millis          = 0.0f;
+    clock_t     elapsed_clocks          = 0;
+
+    for (int i = 0; i < g_timing_iterations; ++i)
+    {
+        // Copy problem to device
+        CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice));
+
+        timer.Start();
+
+        // Run the scan kernel (timed)
+        BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
+            d_in,
+            d_out,
+            d_elapsed);
+
+        timer.Stop();
+        elapsed_millis += timer.ElapsedMillis();
+
+        // Copy clocks from device
+        clock_t clocks;
+        CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
+        elapsed_clocks += clocks;
+
+    }
+
+    // Check for kernel errors and STDIO from the kernel, if any
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Display timing results
+    float avg_millis            = elapsed_millis / g_timing_iterations;
+    float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
+    float avg_clocks            = float(elapsed_clocks) / g_timing_iterations;
+    float avg_clocks_per_item   = avg_clocks / TILE_SIZE;
+
+    printf("\tAverage BlockScan::Sum clocks: %.3f\n", avg_clocks);
+    printf("\tAverage BlockScan::Sum clocks per item: %.3f\n", avg_clocks_per_item);
+    printf("\tAverage kernel millis: %.4f\n", avg_millis);
+    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(cudaFree(d_in));
+    if (d_out) CubDebugExit(cudaFree(d_out));
+    if (d_elapsed) CubDebugExit(cudaFree(d_elapsed));
+}
+
+
+/**
+ * Entry point: parses --v, --i and --grid-size, initializes the device, then runs every block-size/algorithm combination
+ */
+int main(int argc, char** argv)
+{
+    // Read the command-line switches into the globals
+    CommandLineArgs cmdline(argc, argv);
+    g_verbose = cmdline.CheckCmdLineFlag("v");
+    cmdline.GetCmdLineArgument("i", g_timing_iterations);
+    cmdline.GetCmdLineArgument("grid-size", g_grid_size);
+
+    // --help prints the usage (with the current values of the two numeric options) and quits
+    if (cmdline.CheckCmdLineFlag("help"))
+    {
+        printf(
+            "%s [--device=<device-id>] "
+            "[--i=<timing iterations (default:%d)>]"
+            "[--grid-size=<grid size (default:%d)>][--v] \n",
+            argv[0],
+            g_timing_iterations, g_grid_size);
+        exit(0);
+    }
+
+    // Select and initialize the device (honours --device)
+    CubDebugExit(cmdline.DeviceInit());
+
+    // Raking scan: 1024-item tiles, from 1024 threads x 1 item down to 32 threads x 32 items
+    Test<1024, 1, BLOCK_SCAN_RAKING>();
+    Test<512, 2, BLOCK_SCAN_RAKING>();
+    Test<256, 4, BLOCK_SCAN_RAKING>();
+    Test<128, 8, BLOCK_SCAN_RAKING>();
+    Test<64, 16, BLOCK_SCAN_RAKING>();
+    Test<32, 32, BLOCK_SCAN_RAKING>();
+
+    printf("-------------\n");
+
+    Test<1024, 1, BLOCK_SCAN_RAKING_MEMOIZE>();
+    Test<512, 2, BLOCK_SCAN_RAKING_MEMOIZE>();
+    Test<256, 4, BLOCK_SCAN_RAKING_MEMOIZE>();
+    Test<128, 8, BLOCK_SCAN_RAKING_MEMOIZE>();
+    Test<64, 16, BLOCK_SCAN_RAKING_MEMOIZE>();
+    Test<32, 32, BLOCK_SCAN_RAKING_MEMOIZE>();
+
+    printf("-------------\n");
+
+    Test<1024, 1, BLOCK_SCAN_WARP_SCANS>();
+    Test<512, 2, BLOCK_SCAN_WARP_SCANS>();
+    Test<256, 4, BLOCK_SCAN_WARP_SCANS>();
+    Test<128, 8, BLOCK_SCAN_WARP_SCANS>();
+    Test<64, 16, BLOCK_SCAN_WARP_SCANS>();
+    Test<32, 32, BLOCK_SCAN_WARP_SCANS>();
+
+
+    return 0;
+}
+
diff --git a/external/cub/examples/block/reduce_by_key.cu b/external/cub/examples/block/reduce_by_key.cu
new file mode 100644
index 00000000000..d74e1624423
--- /dev/null
+++ b/external/cub/examples/block/reduce_by_key.cu
@@ -0,0 +1,57 @@
+
+
+#include <cub/cub.cuh>
+
+
+/**
+ * Unfinished reduce-by-key sketch: only flags segment heads; no scan is run, nothing is stored
+ */
+template <
+    int         BLOCK_THREADS,          ///< Number of CTA threads
+    typename    KeyT,                   ///< Key type
+    typename    ValueT>                 ///< Value type
+__global__ void Kernel()
+{
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+    typedef cub::KeyValuePair<int, ValueT> OffsetValuePairT;
+
+    // Reduce-value-by-segment scan operator (declared, but not used by this sketch yet)
+    typedef cub::ReduceBySegmentOp<cub::Sum> ReduceBySegmentOpT;
+
+    // Parameterized BlockDiscontinuity type for setting head flags
+    typedef cub::BlockDiscontinuity<
+            KeyT,
+            BLOCK_THREADS>
+        BlockDiscontinuityKeysT;
+
+    // Parameterized BlockScan type
+    typedef cub::BlockScan<
+            OffsetValuePairT,
+            BLOCK_THREADS,
+            cub::BLOCK_SCAN_WARP_SCANS>
+        BlockScanT;
+
+    // Shared memory (a union, so both collectives alias the same storage)
+    __shared__ union TempStorage
+    {
+        typename BlockScanT::TempStorage                scan;           // Scan storage
+        typename BlockDiscontinuityKeysT::TempStorage   discontinuity;  // Discontinuity storage
+    } temp_storage;
+
+    // Convert explicitly: unsigned threadIdx.x inside a braced initializer would narrow for e.g. KeyT = int
+    const KeyT thread_key = static_cast<KeyT>(threadIdx.x / 3);
+
+    // Read data (each thread gets 3 items each, every 9 items is a segment)
+    KeyT    my_keys[3]      = {thread_key, thread_key, thread_key};
+    ValueT  my_values[3]    = {1, 1, 1};
+
+    // Set segment head flags
+    int     my_flags[3];
+    BlockDiscontinuityKeysT(temp_storage.discontinuity).FlagHeads(
+        my_flags,
+        my_keys,
+        cub::Inequality());
+
+    // Barrier after the flagging pass; presumably so temp_storage.scan can reuse the union
+    __syncthreads();
+}
diff --git a/external/cub/examples/device/Makefile b/external/cub/examples/device/Makefile
new file mode 100644
index 00000000000..45b6209baf9
--- /dev/null
+++ b/external/cub/examples/device/Makefile
@@ -0,0 +1,197 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+#-------------------------------------------------------------------------------
+#
+# Makefile usage
+#
+# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>]
+#
+#-------------------------------------------------------------------------------
+ 
+include ../../common.mk 
+ 
+ 
+#-------------------------------------------------------------------------------
+# Includes
+#-------------------------------------------------------------------------------
+
+INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
+
+
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+DEPS =				$(CUB_DEPS) \
+					$(CUB_DIR)test/Makefile \
+					$(CUB_DIR)test/test_util.h \
+					$(CUB_DIR)test/mersenne.h \
+		
+ALL = 	example_device_partition_flagged \
+		example_device_partition_if \
+	 	example_device_radix_sort \
+		example_device_reduce \
+	 	example_device_scan \
+	 	example_device_select_unique \
+		example_device_select_flagged \
+		example_device_select_if \
+		example_device_sort_find_non_trivial_runs
+		
+
+
+#-------------------------------------------------------------------------------
+# make default (no prerequisites and no recipe: builds nothing)
+#-------------------------------------------------------------------------------
+
+.PHONY: default clean all run
+default:
+
+#-------------------------------------------------------------------------------
+# make clean
+#-------------------------------------------------------------------------------
+
+clean :
+	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
+
+
+#-------------------------------------------------------------------------------
+# make all
+#-------------------------------------------------------------------------------
+
+all : $(ALL)
+
+#-------------------------------------------------------------------------------
+# make run
+#-------------------------------------------------------------------------------
+
+run : 
+	for i in $(ALL); do ./bin/$${i}_$(BIN_SUFFIX) --device=$(device) || exit 1; done
+
+
+#-------------------------------------------------------------------------------
+# make example_device_reduce
+#-------------------------------------------------------------------------------
+
+example_device_reduce: bin/example_device_reduce_$(BIN_SUFFIX)
+
+bin/example_device_reduce_$(BIN_SUFFIX) : example_device_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_reduce_$(BIN_SUFFIX) example_device_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_device_partition_flagged
+#-------------------------------------------------------------------------------
+
+example_device_partition_flagged: bin/example_device_partition_flagged_$(BIN_SUFFIX)
+
+bin/example_device_partition_flagged_$(BIN_SUFFIX) : example_device_partition_flagged.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_partition_flagged_$(BIN_SUFFIX) example_device_partition_flagged.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make example_device_partition_if
+#-------------------------------------------------------------------------------
+
+example_device_partition_if: bin/example_device_partition_if_$(BIN_SUFFIX)
+
+bin/example_device_partition_if_$(BIN_SUFFIX) : example_device_partition_if.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_partition_if_$(BIN_SUFFIX) example_device_partition_if.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make example_device_scan
+#-------------------------------------------------------------------------------
+
+example_device_scan: bin/example_device_scan_$(BIN_SUFFIX)
+
+bin/example_device_scan_$(BIN_SUFFIX) : example_device_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_scan_$(BIN_SUFFIX) example_device_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_device_radix_sort
+#-------------------------------------------------------------------------------
+
+example_device_radix_sort: bin/example_device_radix_sort_$(BIN_SUFFIX)
+
+bin/example_device_radix_sort_$(BIN_SUFFIX) : example_device_radix_sort.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_radix_sort_$(BIN_SUFFIX) example_device_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_device_select_unique
+#-------------------------------------------------------------------------------
+
+example_device_select_unique: bin/example_device_select_unique_$(BIN_SUFFIX)
+
+bin/example_device_select_unique_$(BIN_SUFFIX) : example_device_select_unique.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_unique_$(BIN_SUFFIX) example_device_select_unique.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_device_select_flagged
+#-------------------------------------------------------------------------------
+
+example_device_select_flagged: bin/example_device_select_flagged_$(BIN_SUFFIX)
+
+bin/example_device_select_flagged_$(BIN_SUFFIX) : example_device_select_flagged.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_flagged_$(BIN_SUFFIX) example_device_select_flagged.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make example_device_select_if
+#-------------------------------------------------------------------------------
+
+example_device_select_if: bin/example_device_select_if_$(BIN_SUFFIX)
+
+bin/example_device_select_if_$(BIN_SUFFIX) : example_device_select_if.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_if_$(BIN_SUFFIX) example_device_select_if.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make example_device_sort_find_non_trivial_runs
+#-------------------------------------------------------------------------------
+
+example_device_sort_find_non_trivial_runs: bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX)
+
+bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX) : example_device_sort_find_non_trivial_runs.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX) example_device_sort_find_non_trivial_runs.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+
diff --git a/external/cub/examples/device/example_device_partition_flagged.cu b/external/cub/examples/device/example_device_partition_flagged.cu
new file mode 100644
index 00000000000..0c9a6477a4d
--- /dev/null
+++ b/external/cub/examples/device/example_device_partition_flagged.cu
@@ -0,0 +1,233 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DevicePartition::Flagged().
+ *
+ * Partition flagged items from a sequence of int keys using a
+ * corresponding sequence of unsigned char flags.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_partition_flagged.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_partition.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem: h_in receives runs of equal keys (0, 1, 2, ...) whose lengths are
+ * drawn at random and scaled into [1..max_segment]; h_flags is set to 1 on the first
+ * item of every run and to 0 on all other items
+ */
+void Initialize(
+    int             *h_in,
+    unsigned char   *h_flags,
+    int             num_items,
+    int             max_segment)
+{
+    const unsigned short max_short = (unsigned short) -1;
+
+    int run_begin = 0;      // Index of the first item of the run being generated
+    for (int key = 0; run_begin < num_items; ++key)
+    {
+        // RandomBits presumably randomizes run_length; rescale it to [0..max_segment],
+        // then force a length of at least one item
+        unsigned short run_length;
+        RandomBits(run_length);
+        run_length = (unsigned short) ((float(run_length) * (float(max_segment) / float(max_short))));
+        run_length = CUB_MAX(1, run_length);
+
+        // The last run is clipped at the end of the input
+        const int run_end = CUB_MIN(run_begin + run_length, num_items);
+        for (int pos = run_begin; pos < run_end; ++pos)
+        {
+            h_in[pos]    = key;
+            h_flags[pos] = (pos == run_begin) ? 1 : 0;
+        }
+
+        run_begin = run_end;
+    }
+
+    // Dump the generated problem when verbose output is enabled
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("Flags:\n");
+        DisplayResults(h_flags, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve partition problem on the host: flagged items fill h_reference from the front in input
+ * order, unflagged items fill it from the back; returns the number of flagged items
+ */
+int Solve(
+    int             *h_in,
+    unsigned char   *h_flags,
+    int             *h_reference,
+    int             num_items)
+{
+    int front = 0;                  // Next free slot for a selected item
+    int back  = num_items - 1;      // Next free slot for a rejected item
+    for (int idx = 0; idx < num_items; ++idx)
+    {
+        const int item = h_in[idx];
+        if (!h_flags[idx])
+        {
+            h_reference[back--] = item;     // Rejected: written from the end, backwards
+            continue;
+        }
+        h_reference[front++] = item;        // Selected: written from the start, forwards
+    }
+
+    return front;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: partitions a generated problem with DevicePartition::Flagged and checks it against Solve
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = 150;
+    int max_segment         = 40;       // Maximum segment length
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxseg", max_segment);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays
+    int             *h_in        = new int[num_items];
+    int             *h_reference = new int[num_items];
+    unsigned char   *h_flags     = new unsigned char[num_items];
+
+    // Initialize problem and solution
+    Initialize(h_in, h_flags, num_items, max_segment);
+    int num_selected = Solve(h_in, h_flags, h_reference, num_items);
+
+    printf("cub::DevicePartition::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n",
+        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    int             *d_in = NULL;
+    unsigned char   *d_flags = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array and num selected
+    int     *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate temporary storage (the first call, made with a NULL buffer, is used to obtain its size)
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
+
+    // Check for correctness (and display results, if specified); a non-zero compare means FAIL
+    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
+    printf("\t Data %s ", compare ? "FAIL" : "PASS");
+    compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup (all three host arrays, then every device allocation)
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (h_flags) delete[] h_flags;
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/examples/device/example_device_partition_if.cu b/external/cub/examples/device/example_device_partition_if.cu
new file mode 100644
index 00000000000..52ae2d6e49b
--- /dev/null
+++ b/external/cub/examples/device/example_device_partition_if.cu
@@ -0,0 +1,244 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DevicePartition::If().
+ *
+ * Partitions items from a sequence of int keys using a
+ * selection functor (greater-than)
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_partition_if.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_partition.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+/// Selection functor type: accepts the items that are strictly greater than a fixed threshold
+struct GreaterThan
+{
+    int compare;    ///< Threshold that items are tested against
+
+    __host__ __device__ __forceinline__
+    GreaterThan(int threshold) : compare(threshold) {}
+
+    __host__ __device__ __forceinline__
+    bool operator()(const int &item) const
+    {
+        return compare < item;
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Initialize problem: fill h_in with runs of equal keys (0, 1, 2, ...), the length
+ * of each run being drawn at random and scaled into [1..max_segment]
+ */
+void Initialize(
+    int     *h_in,
+    int     num_items,
+    int     max_segment)
+{
+    int next = 0;       // Index of the first item that has not been written yet
+    for (int key = 0; next < num_items; ++key)
+    {
+        // RandomBits presumably randomizes run_length; rescale it to [0..max_segment],
+        // then force a length of at least one item
+        const unsigned short max_short = (unsigned short) -1;
+        unsigned short run_length;
+        RandomBits(run_length);
+        run_length = (unsigned short) ((float(run_length) * (float(max_segment) / float(max_short))));
+        run_length = CUB_MAX(1, run_length);
+
+        // Emit the run, clipped at the end of the input
+        const int run_end = CUB_MIN(next + run_length, num_items);
+        while (next < run_end)
+        {
+            h_in[next] = key;
+            ++next;
+        }
+    }
+
+    // Dump the generated input when verbose output is enabled
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve partition problem on the host: items accepted by select_op go to the front of
+ * h_reference in input order, all other items go to the back in reverse input order.
+ * Returns the number of accepted items.
+ */
+template <typename SelectOp>
+int Solve(
+    int             *h_in,
+    SelectOp        select_op,
+    int             *h_reference,
+    int             num_items)
+{
+    int front = 0;                  // Next free slot for an accepted item
+    int back  = num_items - 1;      // Next free slot for a rejected item
+    for (int idx = 0; idx < num_items; ++idx)
+    {
+        if (select_op(h_in[idx]))
+        {
+            h_reference[front++] = h_in[idx];
+            continue;
+        }
+        h_reference[back--] = h_in[idx];
+    }
+
+    return front;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: partitions a generated problem with DevicePartition::If and checks it against Solve
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = 150;
+    int max_segment         = 40;       // Maximum segment length
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxseg", max_segment);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // The pivot below is read from h_in, so at least one item is required
+    if (num_items < 1) { fprintf(stderr, "--n must be at least 1\n"); exit(1); }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays
+    int *h_in        = new int[num_items];
+    int *h_reference = new int[num_items];
+
+    // Select a pivot index by rescaling a random value into [0..num_items - 1]
+    unsigned int pivot_index;
+    unsigned int max_int = (unsigned int) -1;
+    RandomBits(pivot_index);
+    pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int))));
+    printf("Pivot idx: %u\n", pivot_index); fflush(stdout);
+
+    // Initialize problem and solution (items greater than the pivot's key are selected)
+    Initialize(h_in, num_items, max_segment);
+    GreaterThan select_op(h_in[pivot_index]);
+    int num_selected = Solve(h_in, select_op, h_reference, num_items);
+
+    printf("cub::DevicePartition::If %d items, %d selected (avg run length %d), %d-byte elements\n",
+        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate the problem device array and initialize it from the host input
+    int *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array and num selected
+    int     *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate temporary storage (the first call, made with a NULL buffer, is used to obtain its size)
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
+
+    // Check for correctness (and display results, if specified); a non-zero compare means FAIL
+    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
+    printf("\t Data %s ", compare ? "FAIL" : "PASS");
+    compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/examples/device/example_device_radix_sort.cu b/external/cub/examples/device/example_device_radix_sort.cu
new file mode 100644
index 00000000000..af5de82957c
--- /dev/null
+++ b/external/cub/examples/device/example_device_radix_sort.cu
@@ -0,0 +1,226 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceRadixSort::SortPairs().
+ *
+ * Sorts an array of float keys paired with a corresponding array of int values.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_radix_sort.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <algorithm>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_radix_sort.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console (set from the "v" command-line flag in main)
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory (NOTE(review): meaning of the 'true' ctor argument is not visible here - see util_allocator.cuh)
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Key-value pair whose operator< orders by float key only (value is ignored) and,
+ * unlike the built-in float comparison, places -0.0f strictly before +0.0f.
+ */
+struct Pair
+{
+    float   key;
+    int     value;
+
+    // Sign bit of f, read via a byte-wise copy: dereferencing a float through unsigned* violates strict aliasing
+    static bool SignBit(const float &f)
+    {
+        unsigned int bits = 0;
+        const unsigned char *src = reinterpret_cast<const unsigned char*>(&f);
+        std::copy(src, src + sizeof(f), reinterpret_cast<unsigned char*>(&bits));
+        return (bits & (1u << 31)) != 0;
+    }
+
+    bool operator<(const Pair &b) const
+    {
+        if (key < b.key) return true;
+        if (key > b.key) return false;
+        return SignBit(key) && !SignBit(b.key);   // neither key is smaller: a set sign bit orders before a clear one
+    }
+};
+
+
+/**
+ * Fill h_keys/h_values with RandomBits() data and store the same pairs, stably sorted by Pair::operator<, in the reference arrays.
+ */
+void Initialize(
+    float           *h_keys,
+    int             *h_values,
+    float           *h_reference_keys,
+    int             *h_reference_values,
+    int             num_items)
+{
+    Pair *sorted = new Pair[num_items];     // scratch copy of the problem; holds the reference order after sorting
+
+    for (int item = 0; item < num_items; ++item)
+    {
+        RandomBits(h_keys[item]);
+        RandomBits(h_values[item]);
+        sorted[item].key   = h_keys[item];
+        sorted[item].value = h_values[item];
+    }
+
+    if (g_verbose)
+    {
+        printf("Input keys:\n");
+        DisplayResults(h_keys, num_items);
+        printf("\n\n");
+
+        printf("Input values:\n");
+        DisplayResults(h_values, num_items);
+        printf("\n\n");
+    }
+
+    std::stable_sort(sorted, sorted + num_items);   // stable: pairs that compare equivalent keep generation order
+
+    for (int slot = 0; slot < num_items; ++slot)
+    {
+        h_reference_keys[slot]   = sorted[slot].key;
+        h_reference_values[slot] = sorted[slot].value;
+    }
+
+    delete[] sorted;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Sorts random (float key, int value) pairs with DeviceRadixSort::SortPairs and checks them against the host reference.
+ */
+int main(int argc, char** argv)
+{
+    int num_items = 150;                        // default problem size, overridable with --n
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+
+    // Print usage and quit (every option is shown bracketed, i.e. optional)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    printf("cub::DeviceRadixSort::SortPairs() %d items (%d-byte keys %d-byte values)\n",
+        num_items, int(sizeof(float)), int(sizeof(int)));
+    fflush(stdout);
+
+    // Allocate host arrays (inputs plus the expected sorted outputs)
+    float   *h_keys             = new float[num_items];
+    float   *h_reference_keys   = new float[num_items];
+    int     *h_values           = new int[num_items];
+    int     *h_reference_values = new int[num_items];
+
+    // Initialize problem and solution on host
+    Initialize(h_keys, h_values, h_reference_keys, h_reference_values, num_items);
+
+    // Allocate device arrays: two buffers per DoubleBuffer, each large enough for the whole input
+    DoubleBuffer<float> d_keys;
+    DoubleBuffer<int>   d_values;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(float) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(float) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(int) * num_items));
+
+    // Allocate temporary storage: the first SortPairs call gets a NULL d_temp_storage and is used only to obtain temp_storage_bytes
+    size_t  temp_storage_bytes  = 0;
+    void    *d_temp_storage     = NULL;
+
+    CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Initialize device arrays: the input goes into the buffer each selector currently designates
+    CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Run
+    CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
+
+    // Check for correctness (and display results, if specified); results are read through Current()
+    int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose);
+    printf("\t Compare keys (selector %d): %s\n", d_keys.selector, compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+    compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
+    printf("\t Compare values (selector %d): %s\n", d_values.selector, compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_keys) delete[] h_keys;
+    if (h_reference_keys) delete[] h_reference_keys;
+    if (h_values) delete[] h_values;
+    if (h_reference_values) delete[] h_reference_values;
+
+    if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
+    if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
+    if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
+    if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/examples/device/example_device_reduce.cu b/external/cub/examples/device/example_device_reduce.cu
new file mode 100644
index 00000000000..8d160509ff8
--- /dev/null
+++ b/external/cub/examples/device/example_device_reduce.cu
@@ -0,0 +1,180 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceReduce::Sum().
+ *
+ * Sums an array of int keys.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_reduce.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_reduce.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console (set from the "v" command-line flag in main)
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory (NOTE(review): meaning of the 'true' ctor argument is not visible here - see util_allocator.cuh)
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Fill h_in with 0, 1, ..., num_items - 1 and print it when g_verbose is set.
+ */
+void Initialize(
+    int   *h_in,
+    int     num_items)
+{
+    for (int idx = num_items - 1; idx >= 0; --idx)     // fill order is irrelevant: every slot gets its own index
+        h_in[idx] = idx;
+
+    if (!g_verbose)
+        return;
+
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n\n");
+}
+
+
+/**
+ * Write the sum of h_in[0 .. num_items-1] to h_reference (0 when num_items <= 0).
+ */
+void Solve(
+    int           *h_in,
+    int           &h_reference,
+    int             num_items)
+{
+    // Start from the additive identity so that an empty input yields a defined result
+    // instead of leaving the caller's (possibly uninitialized) variable untouched.
+    int sum = 0;
+    for (int i = 0; i < num_items; ++i)
+        sum += h_in[i];
+
+    h_reference = sum;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Sums the integers 0..num_items-1 on the device with DeviceReduce::Sum and checks the result against the host sum.
+ */
+int main(int argc, char** argv)
+{
+    int num_items = 150;                        // default problem size, overridable with --n
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+
+    // Print usage and quit (every option is shown bracketed, i.e. optional)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    printf("cub::DeviceReduce::Sum() %d items (%d-byte elements)\n",
+        num_items, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate host arrays
+    int* h_in = new int[num_items];
+    int  h_reference = 0;                       // defined even if Solve() has nothing to add up
+
+    // Initialize problem and solution
+    Initialize(h_in, num_items);
+    Solve(h_in, h_reference, num_items);
+
+    // Allocate problem device arrays
+    int *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array (a single int receives the sum)
+    int *d_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * 1));
+
+    // Request and allocate temporary storage: the first Sum call gets a NULL d_temp_storage and is used only to obtain temp_storage_bytes
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(&h_reference, d_out, 1, g_verbose, g_verbose);
+    printf("\t%s", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/examples/device/example_device_scan.cu b/external/cub/examples/device/example_device_scan.cu
new file mode 100644
index 00000000000..53f591cf654
--- /dev/null
+++ b/external/cub/examples/device/example_device_scan.cu
@@ -0,0 +1,186 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceScan::ExclusiveSum().
+ *
+ * Computes an exclusive sum of int keys.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_scan.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_scan.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console (set from the "v" command-line flag in main)
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory (NOTE(review): meaning of the 'true' ctor argument is not visible here - see util_allocator.cuh)
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Store the identity sequence (h_in[i] = i) in h_in; echo it to stdout when g_verbose is set.
+ */
+void Initialize(
+    int        *h_in,
+    int          num_items)
+{
+    for (int value = 0; value < num_items; ++value)
+    {
+        h_in[value] = value;
+    }
+
+    if (!g_verbose) return;                     // quiet mode: nothing to print
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n\n");
+}
+
+/**
+ * Host exclusive prefix sum: h_reference[i] = sum of h_in[0..i-1]; returns the total of all num_items inputs.
+ */
+int Solve(
+    int           *h_in,
+    int           *h_reference,
+    int             num_items)
+{
+    // Running total of every element consumed so far; once the loop
+    // has finished it is the aggregate of the whole input.
+    int running = 0;
+
+    for (int pos = 0; pos < num_items; ++pos)
+    {
+        h_reference[pos] = running;     // exclusive: written before the current element is added
+        running += h_in[pos];
+    }
+
+    return running;
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Computes the exclusive prefix sum of 0..num_items-1 with DeviceScan::ExclusiveSum and checks it against the host scan.
+ */
+int main(int argc, char** argv)
+{
+    int num_items = 150;                        // default problem size, overridable with --n
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+
+    // Print usage and quit (every option is shown bracketed, i.e. optional)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    printf("cub::DeviceScan::ExclusiveSum %d items (%d-byte elements)\n",
+        num_items, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate host arrays
+    int*  h_in = new int[num_items];
+    int*  h_reference = new int[num_items];
+
+    // Initialize problem and solution (the aggregate returned by Solve is not needed here)
+    Initialize(h_in, num_items);
+    Solve(h_in, h_reference, num_items);
+
+    // Allocate problem device arrays
+    int *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array (one output per input item)
+    int *d_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+
+    // Allocate temporary storage: the first ExclusiveSum call gets a NULL d_temp_storage and is used only to obtain temp_storage_bytes
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
+    printf("\t%s", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/examples/device/example_device_select_flagged.cu b/external/cub/examples/device/example_device_select_flagged.cu
new file mode 100644
index 00000000000..00cf7a24f44
--- /dev/null
+++ b/external/cub/examples/device/example_device_select_flagged.cu
@@ -0,0 +1,233 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceSelect::Flagged().
+ *
+ * Selects flagged items from a sequence of int keys using a
+ * corresponding sequence of unsigned char flags.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_select_flagged.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_select.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console (set from the "v" command-line flag in main)
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory (NOTE(review): meaning of the 'true' ctor argument is not visible here - see util_allocator.cuh)
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Fill h_in with runs of consecutive key values (0, 1, 2, ...) and set h_flags to 1 on the first
+ * item of each run and 0 elsewhere.  Run lengths are random, scaled by max_segment, and at least 1.
+ */
+void Initialize(
+    int             *h_in,
+    unsigned char   *h_flags,
+    int             num_items,
+    int             max_segment)
+{
+    const unsigned short max_short = (unsigned short) -1;
+
+    int key = 0;
+    int run_begin = 0;
+    while (run_begin < num_items)
+    {
+        // Draw a run length: random 16 bits scaled by max_segment / max_short, then clamped to >= 1
+        unsigned short repeat;
+        RandomBits(repeat);
+        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
+        repeat = CUB_MAX(1, repeat);
+
+        // The last run is cut short at the end of the arrays
+        const int run_end = CUB_MIN(run_begin + repeat, num_items);
+        for (int pos = run_begin; pos < run_end; ++pos)
+        {
+            h_flags[pos] = (pos == run_begin) ? 1 : 0;
+            h_in[pos] = key;
+        }
+
+        // The next run starts where this one ended and carries the next key value
+        run_begin = run_end;
+        ++key;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("Flags:\n");
+        DisplayResults(h_flags, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Host reference: flagged items go to the front of h_reference (input order), unflagged items to the back (reverse order); returns the flagged count.
+ */
+int Solve(
+    int             *h_in,
+    unsigned char   *h_flags,
+    int             *h_reference,
+    int             num_items)
+{
+    // Two write cursors that move towards each other
+    int front = 0;
+    int back  = num_items - 1;
+
+    for (int pos = 0; pos < num_items; ++pos)
+    {
+        const int item = h_in[pos];
+        if (h_flags[pos])
+            h_reference[front++] = item;    // kept: appended behind the previously kept items
+        else
+            h_reference[back--] = item;     // rejected: stacked downward from the last slot
+    }
+
+    // front advanced once per kept item, so it is also the selection count
+    return front;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Compacts the flagged items of a random run-structured input with DeviceSelect::Flagged and checks data and count against the host reference.
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = 150;      // default problem size, overridable with --n
+    int max_segment         = 40;       // Maximum segment length
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxseg", max_segment);
+
+    // Print usage and quit (every option is shown bracketed, i.e. optional)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays
+    int             *h_in        = new int[num_items];
+    int             *h_reference = new int[num_items];
+    unsigned char   *h_flags     = new unsigned char[num_items];
+
+    // Initialize problem and solution
+    Initialize(h_in, h_flags, num_items, max_segment);
+    int num_selected = Solve(h_in, h_flags, h_reference, num_items);
+
+    printf("cub::DeviceSelect::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n",
+        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    int             *d_in = NULL;
+    unsigned char   *d_flags = NULL;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array and num selected
+    int     *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate temporary storage: the first Flagged call gets a NULL d_temp_storage and is used only to obtain temp_storage_bytes
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
+
+    // Check for correctness (and display results, if specified); only the first num_selected outputs are compared
+    int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
+    printf("\t Data %s ", compare ? "FAIL" : "PASS");
+    compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup (h_flags was previously never released)
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (h_flags) delete[] h_flags;
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
+
+    printf("\n\n");
+    return 0;
+}
+
+
+
diff --git a/external/cub/examples/device/example_device_select_if.cu b/external/cub/examples/device/example_device_select_if.cu
new file mode 100644
index 00000000000..5055f449d55
--- /dev/null
+++ b/external/cub/examples/device/example_device_select_if.cu
@@ -0,0 +1,242 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceSelect::If().
+ *
+ * Selects items from a sequence of int keys using a
+ * selection functor (greater-than)
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_select_if.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_select.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+/// Selection functor type: accepts an item only when it is strictly greater than `compare`
+struct GreaterThan
+{
+    int compare;    // Threshold every candidate item is tested against
+
+    __host__ __device__ __forceinline__
+    GreaterThan(int threshold) : compare(threshold) {}
+
+    __host__ __device__ __forceinline__
+    bool operator()(const int &item) const {
+        return (compare < item);
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Initialize problem: fill h_in with runs of identical keys.  Keys count up
+ * from zero (one key per run) and each run length is a RandomBits sample
+ * scaled into [1..max_segment]; the final run is truncated at num_items.
+ */
+void Initialize(
+    int     *h_in,
+    int     num_items,
+    int     max_segment)
+{
+    // Largest value a raw 16-bit sample can take
+    const unsigned short max_short = (unsigned short) -1;
+
+    int next = 0;
+    for (int key = 0; next < num_items; ++key)
+    {
+        // Draw the length of this run; a draw that scales to zero is bumped up to 1
+        unsigned short repeat;
+        RandomBits(repeat);
+        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
+        repeat = CUB_MAX(1, repeat);
+
+        // Emit the run, stopping early if it would run past the end of the array
+        const int run_end = CUB_MIN(next + repeat, num_items);
+        for (; next < run_end; ++next)
+        {
+            h_in[next] = key;
+        }
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve selection problem on the host: items accepted by select_op are packed
+ * at the front of h_reference in input order, rejected items are written
+ * backwards from the end.  Returns the number of accepted items.
+ */
+template <typename SelectOp>
+int Solve(
+    int             *h_in,
+    SelectOp        select_op,
+    int             *h_reference,
+    int             num_items)
+{
+    int num_selected = 0;                   // Next free slot at the front
+    int reject_slot  = num_items - 1;       // Next free slot at the back
+
+    for (int i = 0; i < num_items; ++i)
+    {
+        // Accepted items fill from the front, rejected ones from the back
+        if (select_op(h_in[i]))
+            h_reference[num_selected++] = h_in[i];
+        else
+            h_reference[reject_slot--] = h_in[i];
+    }
+
+    return num_selected;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: builds a random problem, runs cub::DeviceSelect::If on it, and checks the result against the host reference
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = 150;
+    int max_segment         = 40;       // Maximum segment length
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxseg", max_segment);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] [--device=<device-id>] "
+            "[--maxseg=<max segment length>] [--v]\n", argv[0]);
+        exit(0);
+    }
+
+    // The pivot is read from h_in[pivot_index] below, so at least one input item is required
+    if (num_items < 1) { fprintf(stderr, "Error: --n must be at least 1\n"); exit(1); }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays
+    int *h_in        = new int[num_items];
+    int *h_reference = new int[num_items];
+
+    // Select a pivot index by scaling a RandomBits sample onto [0..num_items - 1]
+    unsigned int pivot_index;
+    unsigned int max_int = (unsigned int) -1;
+    RandomBits(pivot_index);
+    pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int))));
+    printf("Pivot idx: %u\n", pivot_index); fflush(stdout);
+
+    // Initialize problem and solution (the item at the pivot index becomes the selection threshold)
+    Initialize(h_in, num_items, max_segment);
+    GreaterThan select_op(h_in[pivot_index]);
+
+    int num_selected = Solve(h_in, select_op, h_reference, num_items);
+
+    printf("cub::DeviceSelect::If %d items, %d selected (avg run length %d), %d-byte elements\n",
+        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    int *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array and num selected
+    int     *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate temporary storage (the first call, made with a NULL d_temp_storage, fills in temp_storage_bytes)
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
+
+    // Check for correctness (and display results, if specified); only the first num_selected outputs are compared
+    int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
+    printf("\t Data %s ", compare ? "FAIL" : "PASS");
+    compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
diff --git a/external/cub/examples/device/example_device_select_unique.cu b/external/cub/examples/device/example_device_select_unique.cu
new file mode 100644
index 00000000000..b294a18c2c1
--- /dev/null
+++ b/external/cub/examples/device/example_device_select_unique.cu
@@ -0,0 +1,221 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of DeviceSelect::Unique().
+ *
+ * Selects the first element from each run of identical values from a sequence
+ * of int keys.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_select_unique.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_select.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem: fill h_in with runs of identical keys.  Keys count up
+ * from zero (one key per run) and each run length is a RandomBits sample
+ * scaled into [1..max_segment]; the final run is truncated at num_items.
+ */
+void Initialize(
+    int     *h_in,
+    int     num_items,
+    int     max_segment)
+{
+    // Largest value a raw 16-bit sample can take
+    const unsigned short max_short = (unsigned short) -1;
+
+    int next = 0;
+    for (int key = 0; next < num_items; ++key)
+    {
+        // Draw the length of this run; a draw that scales to zero is bumped up to 1
+        unsigned short repeat;
+        RandomBits(repeat);
+        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
+        repeat = CUB_MAX(1, repeat);
+
+        // Emit the run, stopping early if it would run past the end of the array
+        const int run_end = CUB_MIN(next + repeat, num_items);
+        for (; next < run_end; ++next)
+        {
+            h_in[next] = key;
+        }
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve unique problem on the host: copy the first item of every run of
+ * equal consecutive values from h_in into h_reference, preserving order.
+ * Returns the number of items written (zero when num_items is not positive).
+ */
+int Solve(
+    int         *h_in,
+    int         *h_reference,
+    int         num_items)
+{
+    int num_selected = 0;
+
+    for (int i = 0; i < num_items; ++i)
+    {
+        // An item opens a new run when it is the very first item or when it
+        // differs from its predecessor
+        const bool starts_run = (i == 0) || (h_in[i - 1] != h_in[i]);
+        if (starts_run)
+        {
+            h_reference[num_selected] = h_in[i];
+            ++num_selected;
+        }
+    }
+
+    return num_selected;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: builds a random problem, runs cub::DeviceSelect::Unique on it, and checks the result against the host reference
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = 150;
+    int max_segment         = 40;       // Maximum segment length
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxseg", max_segment);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays
+    int*  h_in        = new int[num_items];
+    int*  h_reference = new int[num_items];
+
+    // Initialize problem and solution
+    Initialize(h_in, num_items, max_segment);
+    int num_selected = Solve(h_in, h_reference, num_items);
+
+    // num_selected is zero for an empty problem, so the average is guarded against dividing by zero
+    printf("cub::DeviceSelect::Unique %d items (%d-byte elements), %d selected (avg run length %d)\n",
+        num_items, (int) sizeof(int), num_selected, (num_selected > 0) ? num_items / num_selected : 0);
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    int *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
+
+    // Allocate device output array and num selected
+    int     *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate temporary storage (the first call, made with a NULL d_temp_storage, fills in temp_storage_bytes)
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Run
+    CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items));
+
+    // Check for correctness (and display results, if specified)
+    int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
+    printf("\t Data %s ", compare ? "FAIL" : "PASS");
+    compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    printf("\n\n");
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/examples/device/example_device_sort_find_non_trivial_runs.cu b/external/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
new file mode 100644
index 00000000000..86d4ac55bd1
--- /dev/null
+++ b/external/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
@@ -0,0 +1,384 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Simple example of sorting a sequence of keys and values (each pair is a
+ * randomly-selected int32 paired with its original offset in the unsorted sequence), and then
+ * isolating all maximal, non-trivial (having length > 1) "runs" of duplicates.
+ *
+ * To compile using the command line:
+ *   nvcc -arch=sm_XX example_device_sort_find_non_trivial_runs.cu -I../.. -lcudart -O3
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <algorithm>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_run_length_encode.cuh>
+
+#include "../../test/test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Key-value record whose ordering is defined by the key alone, which lets
+ * std::sort / std::stable_sort arrange whole pairs by key.
+ */
+template <typename Key, typename Value>
+struct Pair
+{
+    Key     key;        // Field the ordering is based on
+    Value   value;      // Payload carried along with the key
+
+    // Less-than looks at the keys only; the values never take part in the
+    // comparison, so pairs with equal keys are equivalent to each other
+    bool operator<(const Pair &rhs) const { return (key < rhs.key); }
+};
+
+
+/**
+ * Pair ostream operator: writes a pair as <key,value>
+ */
+template <typename Key, typename Value>
+std::ostream& operator<<(std::ostream& os, const Pair<Key, Value>& val)
+{
+    // The standard inserters hand back the stream they wrote to, so the chain can be returned directly
+    return os << '<' << val.key << ',' << val.value << '>';
+}
+
+
+/**
+ * Initialize problem: h_values[i] = i; h_keys[i] = i when max_key is -1, otherwise a RandomBits sample scaled by max_key / UINT_MAX
+ */
+template <typename Key, typename Value>
+void Initialize(
+    Key    *h_keys,
+    Value  *h_values,
+    int    num_items,
+    int    max_key)
+{
+    const float scale = float(max_key) / float(UINT_MAX);
+    for (int idx = 0; idx < num_items; ++idx)
+    {
+        // A sample is drawn for every item, including the max_key == -1 case where it goes unused
+        Key sample;
+        RandomBits(sample);
+        h_keys[idx] = (max_key == -1) ? idx : (Key) (scale * sample);
+        h_values[idx] = idx;
+    }
+
+    if (!g_verbose) return;
+
+    printf("Keys:\n");
+    DisplayResults(h_keys, num_items);
+    printf("\n\n");
+
+    printf("Values:\n");
+    DisplayResults(h_values, num_items);
+    printf("\n\n");
+}
+
+
+/**
+ * Solve sorted non-trivial subrange problem: stable-sort the pairs by key, then record the offset and
+ * length of every run of equal keys that is longer than one.  Returns the number of runs recorded.
+ */
+template <typename Key, typename Value>
+int Solve(
+    Key     *h_keys,
+    Value   *h_values,
+    int     num_items,
+    int     *h_offsets_reference,
+    int     *h_lengths_reference)
+{
+    // Sort (on a private copy, so h_keys / h_values are left untouched)
+
+    Pair<Key, Value> *h_pairs = new Pair<Key, Value>[num_items];
+    for (int i = 0; i < num_items; ++i)
+    {
+        h_pairs[i].key    = h_keys[i];
+        h_pairs[i].value  = h_values[i];
+    }
+
+    std::stable_sort(h_pairs, h_pairs + num_items);
+
+    if (g_verbose)
+    {
+        printf("Sorted pairs:\n");
+        DisplayResults(h_pairs, num_items);
+        printf("\n\n");
+    }
+
+    // Find non-trivial runs
+
+    Key     previous        = (num_items > 0) ? h_pairs[0].key : Key();    // h_pairs has no element 0 when the problem is empty
+    int     length          = 1;                                           // Length of the run currently being scanned
+    int     num_runs        = 0;                                           // Non-trivial runs recorded so far
+    int     run_begin       = 0;                                           // Offset at which the current run starts
+
+    for (int i = 1; i < num_items; ++i)
+    {
+        if (previous != h_pairs[i].key)
+        {
+            // The key changed: the run that just ended is recorded only if it was non-trivial
+            if (length > 1)
+            {
+                h_offsets_reference[num_runs]     = run_begin;
+                h_lengths_reference[num_runs]     = length;
+                num_runs++;
+            }
+            length = 1;
+            run_begin = i;
+        }
+        else
+        {
+            length++;
+        }
+        previous = h_pairs[i].key;
+    }
+
+    // The last run is never closed by a key change inside the loop, so it is handled here
+    if (length > 1)
+    {
+        h_offsets_reference[num_runs]   = run_begin;
+        h_lengths_reference[num_runs]   = length;
+        num_runs++;
+    }
+
+    delete[] h_pairs;
+    return num_runs;
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: sorts random key-value pairs on the device, isolates non-trivial runs of equal keys, and checks against the host reference
+ */
+int main(int argc, char** argv)
+{
+    typedef unsigned int    Key;
+    typedef int             Value;
+
+    int timing_iterations   = 0;
+    int num_items           = 40;
+    Key max_key             = 20;       // Max item
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("maxkey", max_key);
+    args.GetCmdLineArgument("i", timing_iterations);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--i=<timing iterations>] "
+            "[--n=<input items, default 40>] "
+            "[--maxkey=<max key, default 20 (use -1 to test only unique keys)>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Allocate host arrays (problem and reference solution)
+
+    Key     *h_keys                 = new Key[num_items];
+    Value   *h_values               = new Value[num_items];
+    int     *h_offsets_reference    = new int[num_items];
+    int     *h_lengths_reference    = new int[num_items];
+
+    // Initialize key-value pairs and compute reference solution (sort them, and identify non-trivial runs)
+    printf("Computing reference solution on CPU for %d items (max key %d)\n", num_items, (int) max_key);
+    fflush(stdout);
+
+    Initialize(h_keys, h_values, num_items, max_key);
+    int num_runs = Solve(h_keys, h_values, num_items, h_offsets_reference, h_lengths_reference);
+
+    printf("%d non-trivial runs\n", num_runs);
+    fflush(stdout);
+
+    // Repeat for performance timing (iteration 0 is the correctness check and is not timed)
+    GpuTimer gpu_timer;
+    GpuTimer gpu_rle_timer;
+    float elapsed_millis = 0.0;
+    float elapsed_rle_millis = 0.0;
+    for (int i = 0; i <= timing_iterations; ++i)
+    {
+
+        // Allocate and initialize device arrays for sorting
+        DoubleBuffer<Key>       d_keys;
+        DoubleBuffer<Value>     d_values;
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items));
+
+        CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(Key) * num_items, cudaMemcpyHostToDevice));
+        CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(Value) * num_items, cudaMemcpyHostToDevice));
+
+        // Start timer
+        gpu_timer.Start();
+
+        // Allocate temporary storage for sorting
+        size_t  temp_storage_bytes  = 0;
+        void    *d_temp_storage     = NULL;
+        CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
+        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+        // Do the sort
+        CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
+
+        // Free unused buffers and sorting temporary storage
+        if (d_keys.d_buffers[d_keys.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector ^ 1]));
+        if (d_values.d_buffers[d_values.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector ^ 1]));
+        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+        // Start RLE timer
+        gpu_rle_timer.Start();
+
+        // Allocate device arrays for enumerating non-trivial runs
+        int     *d_offsets_out   = NULL;
+        int     *d_lengths_out   = NULL;
+        int     *d_num_runs      = NULL;
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offsets_out, sizeof(int) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(int) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int) * 1));
+
+        // Allocate temporary storage for isolating non-trivial runs
+        d_temp_storage = NULL;
+        CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys.d_buffers[d_keys.selector],
+            d_offsets_out,
+            d_lengths_out,
+            d_num_runs,
+            num_items));
+        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+        // Do the isolation
+        CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys.d_buffers[d_keys.selector],
+            d_offsets_out,
+            d_lengths_out,
+            d_num_runs,
+            num_items));
+
+        // Free keys buffer
+        if (d_keys.d_buffers[d_keys.selector]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector]));
+
+        //
+        // Hypothetically do stuff with the original key-indices corresponding to non-trivial runs of identical keys
+        //
+
+        // Stop both timers (overall sort-and-isolate, and RLE isolation only)
+        gpu_timer.Stop();
+        gpu_rle_timer.Stop();
+
+        if (i == 0)
+        {
+            // First iteration is a warmup: check for correctness (and display results, if specified)
+
+            printf("\nRUN OFFSETS: \n");
+            int compare = CompareDeviceResults(h_offsets_reference, d_offsets_out, num_runs, true, g_verbose);
+            printf("\t\t %s ", compare ? "FAIL" : "PASS");
+
+            printf("\nRUN LENGTHS: \n");
+            compare |= CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose);
+            printf("\t\t %s ", compare ? "FAIL" : "PASS");
+
+            printf("\nNUM RUNS: \n");
+            compare |= CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
+            printf("\t\t %s ", compare ? "FAIL" : "PASS");
+
+            AssertEquals(0, compare);
+        }
+        else
+        {
+            elapsed_millis += gpu_timer.ElapsedMillis();
+            elapsed_rle_millis += gpu_rle_timer.ElapsedMillis();
+        }
+
+        // GPU cleanup
+
+        if (d_values.d_buffers[d_values.selector]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector]));
+        if (d_offsets_out) CubDebugExit(g_allocator.DeviceFree(d_offsets_out));
+        if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
+        if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
+        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    }
+
+    // Host cleanup
+    if (h_keys) delete[] h_keys;
+    if (h_values) delete[] h_values;
+    if (h_offsets_reference) delete[] h_offsets_reference;
+    if (h_lengths_reference) delete[] h_lengths_reference;
+
+    printf("\n\n");
+
+    if (timing_iterations > 0)
+    {
+        printf("%d timing iterations, average time to sort and isolate non-trivial duplicates: %.3f ms (%.3f ms spent in RLE isolation)\n",
+            timing_iterations,
+            elapsed_millis / timing_iterations,
+            elapsed_rle_millis / timing_iterations);
+    }
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/experimental/Makefile b/external/cub/experimental/Makefile
new file mode 100644
index 00000000000..77810746c7f
--- /dev/null
+++ b/external/cub/experimental/Makefile
@@ -0,0 +1,125 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+#-------------------------------------------------------------------------------
+#
+# Makefile usage
+#
+# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>] [quicktest=<0|1>]
+#
+#-------------------------------------------------------------------------------
+ 
+include ../common.mk 
+
+#-------------------------------------------------------------------------------
+# Commandline Options
+#-------------------------------------------------------------------------------
+
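+# [mkl=<0|1>] compile against Intel MKL (adds -DCUB_MKL plus MKL/OpenMP flags). NOTE(review): the OSUPPER test below is evaluated before OSUPPER is assigned in this file, so it presumably relies on ../common.mk -- confirm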
+ifeq ($(mkl), 1)
+	DEFINES 	+= -DCUB_MKL
+
+ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
+	LIBS 		+=	mkl_intel_lp64.lib mkl_intel_thread.lib  mkl_core.lib libiomp5md.lib
+	NVCCFLAGS 	+= -Xcompiler /openmp
+else
+	LIBS		+= -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm
+	NVCCFLAGS 	+= -Xcompiler -fopenmp
+	
+endif	
+
+endif
+
+
+#-------------------------------------------------------------------------------
+# Compiler and compilation platform
+#-------------------------------------------------------------------------------
+
+# Includes
+INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
+
+# detect OS
+OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+exp_rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+EXP_DEPS = 	$(call rwildcard, ./,*.cuh) \
+			$(call rwildcard, ./,*.h)
+
+DEPS =				$(CUB_DEPS) \
+					$(EXP_DEPS) \
+					$(CUB_DIR)test/Makefile \
+					$(CUB_DIR)test/test_util.h \
+					$(CUB_DIR)test/mersenne.h \
+
+		
+
+#-------------------------------------------------------------------------------
+# make default
+#-------------------------------------------------------------------------------
+
+.PHONY : default	# not a real file: keep it runnable even if a file named "default" exists
+default:
+
+#-------------------------------------------------------------------------------
+# make clean
+#-------------------------------------------------------------------------------
+.PHONY : clean	# not a real file: always run the recipe, even if a file named "clean" exists
+clean :
+	rm -f bin/*$(CPU_ARCH_SUFFIX)*
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
+
+
+
+#-------------------------------------------------------------------------------
+# make histogram_compare
+#-------------------------------------------------------------------------------
+.PHONY : histogram_compare	# alias for the binary below, never a file of its own
+histogram_compare: bin/histogram_compare_$(BIN_SUFFIX)
+
+bin/histogram_compare_$(BIN_SUFFIX) : histogram_compare.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/histogram_compare_$(BIN_SUFFIX) histogram_compare.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+	
+
+
+#-------------------------------------------------------------------------------
+# make spmv_compare
+#-------------------------------------------------------------------------------
+.PHONY : spmv_compare	# alias for the binary below, never a file of its own
+spmv_compare: bin/spmv_compare_$(BIN_SUFFIX)
+
+bin/spmv_compare_$(BIN_SUFFIX) : spmv_compare.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/spmv_compare_$(BIN_SUFFIX) spmv_compare.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -lcusparse $(MKL_LIBS) -O3
+	
+
diff --git a/external/cub/experimental/defunct/example_coo_spmv.cu b/external/cub/experimental/defunct/example_coo_spmv.cu
new file mode 100644
index 00000000000..d60697d579c
--- /dev/null
+++ b/external/cub/experimental/defunct/example_coo_spmv.cu
@@ -0,0 +1,1070 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * An implementation of COO SpMV using prefix scan to implement a
+ * reduce-value-by-row strategy
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iterator>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <stdio.h>
+
+#include <cub/cub.cuh>
+
+#include "coo_graph.cuh"
+#include "../test/test_util.h"
+
+using namespace cub;
+using namespace std;
+
+
+/******************************************************************************
+ * Globals, constants, and typedefs
+ ******************************************************************************/
+
+typedef int         VertexId;   // vertex ids are stored as (signed) int, not uint32
+typedef double      Value;      // double-precision floating point values
+
+bool                    g_verbose       = false;
+int                     g_timing_iterations    = 1;
+CachingDeviceAllocator  g_allocator;
+
+
+/******************************************************************************
+ * Texture referencing
+ ******************************************************************************/
+
+/**
+ * Wraps a static 1D texture reference through which elements of the multiplicand vector are fetched
+ */
+template <typename Value>
+struct TexVector
+{
+    // Element type fetched through the texture: uint2 stands in for double (CUDA doesn't load doubles as texture items), any other Value is used as-is
+    typedef typename If<(Equals<Value, double>::VALUE), uint2, Value>::Type CastType;
+
+    // Texture reference type (1D, element-type read mode)
+    typedef texture<CastType, cudaTextureType1D, cudaReadModeElementType> TexRef;
+
+    static TexRef ref;
+
+    /**
+     * Binds ref to `elements` items of CastType starting at d_in; does nothing when d_in is NULL
+     */
+    static void BindTexture(void *d_in, int elements)
+    {
+        cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<CastType>();
+        if (d_in)
+        {
+            size_t offset;      // Offset reported by cudaBindTexture; not consulted afterwards
+            size_t bytes = sizeof(CastType) * elements;
+            CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes));
+        }
+    }
+
+    /**
+     * Unbinds ref (the CUDA status is routed through CubDebugExit)
+     */
+    static void UnbindTexture()
+    {
+        CubDebugExit(cudaUnbindTexture(ref));
+    }
+
+    /**
+     * Fetches element `offset` through the texture and reinterprets the fetched CastType bits as a Value
+     */
+    static __device__ __forceinline__ Value Load(int offset)
+    {
+        Value output;
+        reinterpret_cast<typename TexVector<Value>::CastType &>(output) = tex1Dfetch(TexVector<Value>::ref, offset);
+        return output;
+    }
+};
+
+// Definition of the static texture reference member declared in TexVector
+template <typename Value>
+typename TexVector<Value>::TexRef TexVector<Value>::ref = 0;
+
+
+/******************************************************************************
+ * Utility types
+ ******************************************************************************/
+
+
+/**
+ * A partial dot-product sum paired with the id of the row it belongs to (primary template)
+ */
+template <typename VertexId, typename Value>
+struct PartialProduct
+{
+    VertexId    row;            /// Id of the row this partial sum contributes to
+    Value       partial;        /// Partial dot-product sum for that row
+};
+
+
+/**
+ * A partial dot-product sum paired with a corresponding row-id (specialized for double-int pairings)
+ */
+template <>
+struct PartialProduct<int, double>
+{
+    long long   row;            /// Row-id, held as long long rather than int -- presumably so both fields are 8 bytes wide; TODO confirm intent
+    double      partial;        /// Partial dot-product sum for that row
+};
+
+
+/**
+ * Scan operator over (row, partial) pairs: sums partials within a row, restarts at a row change
+ */
+struct ReduceByKeyOp
+{
+    template <typename PartialProduct>
+    __device__ __forceinline__ PartialProduct operator()(
+        const PartialProduct &first,
+        const PartialProduct &second)
+    {
+        // The combined item always carries the row id of the right-hand operand
+        PartialProduct combined;
+        combined.row = second.row;
+
+        // Keep accumulating only while both operands belong to the same row
+        if (second.row != first.row) combined.partial = second.partial;
+        else combined.partial = first.partial + second.partial;
+        return combined;
+    }
+};
+
+
+/**
+ * Stateful prefix functor handed to BlockScan so that a running total survives from call to call
+ */
+template <typename PartialProduct>
+struct BlockPrefixCallbackOp
+{
+    // Prefix accumulated over every aggregate folded in so far
+    PartialProduct running_prefix;
+
+    /**
+     * Yields the prefix held before this call, then folds block_aggregate into running_prefix
+     */
+    __device__ __forceinline__ PartialProduct operator()(
+        const PartialProduct &block_aggregate)              ///< The aggregate of the inputs of the current BlockScan call
+    {
+        // Snapshot the seed for the current scan before advancing the running total
+        const PartialProduct tile_seed = running_prefix;
+        ReduceByKeyOp combine;
+
+        running_prefix = combine(tile_seed, block_aggregate);
+        return tile_seed;
+    }
+};
+
+
+/**
+ * Binary predicate used for head-flagging: compares two adjacent row identifiers.
+ */
+struct NewRowOp
+{
+    /// Yields true when row_b differs from row_a, i.e. row_b begins a new row
+    template <typename VertexId>
+    __device__ __forceinline__ bool operator()(
+        const VertexId& row_a,
+        const VertexId& row_b)
+    {
+        const bool starts_new_row = (row_a != row_b); return starts_new_row;
+    }
+};
+
+
+
+/******************************************************************************
+ * Persistent thread block types
+ ******************************************************************************/
+
+/**
+ * SpMV thread block abstraction for processing a contiguous segment of
+ * sparse COO tiles: the tuples at indices [block_offset, block_end) of d_rows/d_columns/d_values.
+ */
+template <
+    int             BLOCK_THREADS,      ///< Thread count handed to the BlockScan / BlockExchange / BlockDiscontinuity collectives
+    int             ITEMS_PER_THREAD,   ///< COO tuples held by each thread per tile
+    typename        VertexId,           ///< Row / column index type
+    typename        Value>              ///< Matrix and vector element type
+struct PersistentBlockSpmv
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Constants
+    enum
+    {
+        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,     // COO tuples consumed per tile
+    };
+
+    // Head flag type
+    typedef int HeadFlag;
+
+    // Partial dot product type
+    typedef PartialProduct<VertexId, Value> PartialProduct;
+
+    // Parameterized BlockScan type for reduce-value-by-row scan
+    typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;
+
+    // Parameterized BlockExchange type for exchanging rows between warp-striped -> blocked arrangements
+    typedef BlockExchange<VertexId, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeRows;
+
+    // Parameterized BlockExchange type for exchanging values between warp-striped -> blocked arrangements
+    typedef BlockExchange<Value, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeValues;
+
+    // Parameterized BlockDiscontinuity type for setting head-flags for each new row segment (NOTE(review): item type is HeadFlag although it is applied to VertexId rows -- confirm)
+    typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;
+
+    // Shared memory type for this thread block
+    struct TempStorage
+    {
+        union
+        {
+            typename BlockExchangeRows::TempStorage         exchange_rows;      // Smem needed for BlockExchangeRows
+            typename BlockExchangeValues::TempStorage       exchange_values;    // Smem needed for BlockExchangeValues
+            struct
+            {
+                typename BlockScan::TempStorage             scan;               // Smem needed for BlockScan
+                typename BlockDiscontinuity::TempStorage    discontinuity;      // Smem needed for BlockDiscontinuity
+            };
+        };
+
+        VertexId        first_block_row;    ///< The first row-ID seen by this thread block
+        VertexId        last_block_row;     ///< The last row-ID seen by this thread block
+        Value           first_product;      ///< The first dot-product written by this thread block
+    };
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    TempStorage                     &temp_storage;
+    BlockPrefixCallbackOp<PartialProduct>   prefix_op;
+    VertexId                        *d_rows;
+    VertexId                        *d_columns;
+    Value                           *d_values;
+    Value                           *d_vector;
+    Value                           *d_result;
+    PartialProduct                  *d_block_partials;
+    int                             block_offset;
+    int                             block_end;
+
+
+    //---------------------------------------------------------------------
+    // Operations
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor: stores the arguments, lets thread 0 seed the shared scalars and its prefix_op, then synchronizes the block
+     */
+    __device__ __forceinline__
+    PersistentBlockSpmv(
+        TempStorage                 &temp_storage,
+        VertexId                    *d_rows,
+        VertexId                    *d_columns,
+        Value                       *d_values,
+        Value                       *d_vector,
+        Value                       *d_result,
+        PartialProduct              *d_block_partials,
+        int                         block_offset,
+        int                         block_end)
+    :
+        temp_storage(temp_storage),
+        d_rows(d_rows),
+        d_columns(d_columns),
+        d_values(d_values),
+        d_vector(d_vector),
+        d_result(d_result),
+        d_block_partials(d_block_partials),
+        block_offset(block_offset),
+        block_end(block_end)
+    {
+        // Initialize scalar shared memory values
+        if (threadIdx.x == 0)
+        {
+            VertexId first_block_row            = d_rows[block_offset];
+            VertexId last_block_row             = d_rows[block_end - 1];
+
+            temp_storage.first_block_row        = first_block_row;
+            temp_storage.last_block_row         = last_block_row;
+            temp_storage.first_product          = Value(0);
+
+            // Initialize prefix_op to identity
+            prefix_op.running_prefix.row        = first_block_row;
+            prefix_op.running_prefix.partial    = Value(0);
+        }
+
+        __syncthreads();
+    }
+
+
+    /**
+     * Processes a COO input tile of edges, outputting dot products for each row (the block_offset parameter shadows the member)
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ProcessTile(
+        int block_offset,
+        int guarded_items = 0)
+    {
+        VertexId        columns[ITEMS_PER_THREAD];
+        VertexId        rows[ITEMS_PER_THREAD];
+        Value           values[ITEMS_PER_THREAD];
+        PartialProduct  partial_sums[ITEMS_PER_THREAD];
+        HeadFlag        head_flags[ITEMS_PER_THREAD];
+
+        // Load a warp-striped tile of A (sparse row-ids, column-ids, and values)
+        if (FULL_TILE)
+        {
+            // Unguarded loads
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns);
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values);
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows);
+        }
+        else
+        {
+            // This is a partial-tile (e.g., the last tile of input).  Out-of-bound items get column 0,
+            // value 0 and the block's last row-id, so they add nothing to any dot product
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns, guarded_items, VertexId(0));
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values, guarded_items, Value(0));
+            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows, guarded_items, temp_storage.last_block_row);
+        }
+
+        // Load the referenced values from x and compute the dot product partials sums
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+#if CUB_PTX_ARCH >= 350
+            values[ITEM] *= ThreadLoad<LOAD_LDG>(d_vector + columns[ITEM]);
+#else
+            values[ITEM] *= TexVector<Value>::Load(columns[ITEM]);
+#endif
+        }
+
+        // Transpose from warp-striped to blocked arrangement
+        BlockExchangeValues(temp_storage.exchange_values).WarpStripedToBlocked(values);
+
+        __syncthreads();    // exchange_values and exchange_rows alias the same union storage
+
+        // Transpose from warp-striped to blocked arrangement
+        BlockExchangeRows(temp_storage.exchange_rows).WarpStripedToBlocked(rows);
+
+        // Barrier for smem reuse and coherence
+        __syncthreads();
+
+        // Flag row heads by looking for discontinuities
+        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
+            head_flags,                     // (Out) Head flags
+            rows,                           // Original row ids
+            NewRowOp(),                     // Functor for detecting start of new rows
+            prefix_op.running_prefix.row);  // Last row ID from previous tile to compare with first row ID in this tile
+
+        // Assemble partial product structures
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            partial_sums[ITEM].partial = values[ITEM];
+            partial_sums[ITEM].row = rows[ITEM];
+        }
+
+        // Perform the reduce-value-by-row across partial_sums using an exclusive prefix scan (in place)
+        PartialProduct block_aggregate;
+        BlockScan(temp_storage.scan).ExclusiveScan(
+            partial_sums,                   // Scan input
+            partial_sums,                   // Scan output
+            ReduceByKeyOp(),                // Scan operator
+            block_aggregate,                // Block-wide total (unused)
+            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
+
+        // Barrier for smem reuse and coherence
+        __syncthreads();
+
+        // Scatter an accumulated dot product if it is the head of a valid row (partial_sums now holds the exclusive-scan output)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if (head_flags[ITEM])
+            {
+                d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
+
+                // Save off the first partial product that this thread block will scatter
+                if (partial_sums[ITEM].row == temp_storage.first_block_row)
+                {
+                    temp_storage.first_product = partial_sums[ITEM].partial;
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Iterate over input tiles belonging to this thread block, then publish the block's boundary results from thread 0
+     */
+    __device__ __forceinline__
+    void ProcessTiles()
+    {
+        // Process full tiles
+        while (block_offset <= block_end - TILE_ITEMS)
+        {
+            ProcessTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Process the last, partially-full tile (if present)
+        int guarded_items = block_end - block_offset;
+        if (guarded_items)
+        {
+            ProcessTile<false>(block_offset, guarded_items);
+        }
+        __syncthreads();    // Any thread may have stored temp_storage.first_product while scattering the last tile; thread 0 reads it below
+        if (threadIdx.x == 0)
+        {
+            if (gridDim.x == 1)
+            {
+                // Scatter the final aggregate (this kernel contains only 1 thread block)
+                d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
+            }
+            else
+            {
+                // Write the first and last partial products from this thread block so
+                // that they can be subsequently "fixed up" in the next kernel.
+
+                PartialProduct first_product;
+                first_product.row       = temp_storage.first_block_row;
+                first_product.partial   = temp_storage.first_product;
+
+                d_block_partials[blockIdx.x * 2]          = first_product;
+                d_block_partials[(blockIdx.x * 2) + 1]    = prefix_op.running_prefix;
+            }
+        }
+    }
+};
+
+
+/**
+ * Threadblock abstraction for "fixing up" an array of interblock SpMV partial products (meant to run as a single thread block).
+ */
+template <
+    int             BLOCK_THREADS,      ///< Thread count handed to the BlockScan / BlockDiscontinuity collectives
+    int             ITEMS_PER_THREAD,   ///< Partial products held by each thread per tile
+    typename        VertexId,           ///< Row index type
+    typename        Value>              ///< Dot-product value type
+struct FinalizeSpmvBlock
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Constants
+    enum
+    {
+        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,     // Partial products consumed per tile
+    };
+
+    // Head flag type
+    typedef int HeadFlag;
+
+    // Partial dot product type
+    typedef PartialProduct<VertexId, Value> PartialProduct;
+
+    // Parameterized BlockScan type for reduce-value-by-row scan
+    typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;
+
+    // Parameterized BlockDiscontinuity type for setting head-flags for each new row segment (NOTE(review): item type is HeadFlag although it is applied to VertexId rows -- confirm)
+    typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;
+
+    // Shared memory type for this thread block
+    struct TempStorage
+    {
+        typename BlockScan::TempStorage           scan;               // Smem needed for reduce-value-by-row scan
+        typename BlockDiscontinuity::TempStorage  discontinuity;      // Smem needed for head-flagging
+
+        VertexId last_block_row;                                      // Row-id of the last partial product in the array
+    };
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    TempStorage                     &temp_storage;
+    BlockPrefixCallbackOp<PartialProduct>   prefix_op;
+    Value                           *d_result;
+    PartialProduct                  *d_block_partials;
+    int                             num_partials;
+
+
+    //---------------------------------------------------------------------
+    // Operations
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor: stores the arguments, lets thread 0 seed last_block_row and its prefix_op, then synchronizes the block
+     */
+    __device__ __forceinline__
+    FinalizeSpmvBlock(
+        TempStorage                 &temp_storage,
+        Value                       *d_result,
+        PartialProduct              *d_block_partials,
+        int                         num_partials)
+    :
+        temp_storage(temp_storage),
+        d_result(d_result),
+        d_block_partials(d_block_partials),
+        num_partials(num_partials)
+    {
+        // Initialize scalar shared memory values
+        if (threadIdx.x == 0)
+        {
+            VertexId first_block_row            = d_block_partials[0].row;
+            VertexId last_block_row             = d_block_partials[num_partials - 1].row;
+            temp_storage.last_block_row         = last_block_row;
+
+            // Initialize prefix_op to identity
+            prefix_op.running_prefix.row        = first_block_row;
+            prefix_op.running_prefix.partial    = Value(0);
+        }
+
+        __syncthreads();
+    }
+
+
+    /**
+     * Processes one tile of block partial products, outputting the combined dot product of each completed row
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__
+    void ProcessTile(
+        int block_offset,
+        int guarded_items = 0)
+    {
+        VertexId        rows[ITEMS_PER_THREAD];
+        PartialProduct  partial_sums[ITEMS_PER_THREAD];
+        HeadFlag        head_flags[ITEMS_PER_THREAD];
+
+        // Load a tile of block partials from previous kernel
+        if (FULL_TILE)
+        {
+            // Full tile
+#if CUB_PTX_ARCH >= 350
+            LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums);
+#else
+            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums);
+#endif
+        }
+        else
+        {
+            // Partial tile (extend zero-valued coordinates of the last partial-product for out-of-bounds items)
+            PartialProduct default_sum;
+            default_sum.row = temp_storage.last_block_row;
+            default_sum.partial = Value(0);
+
+#if CUB_PTX_ARCH >= 350
+            LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
+#else
+            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
+#endif
+        }
+
+        // Copy out row IDs for row-head flagging
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            rows[ITEM] = partial_sums[ITEM].row;
+        }
+
+        // Flag row heads by looking for discontinuities (flags are the first argument, as in PersistentBlockSpmv::ProcessTile)
+        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
+            head_flags,                     // (Out) Head flags
+            rows,                           // Original row ids
+            NewRowOp(),                     // Functor for detecting start of new rows
+            prefix_op.running_prefix.row);   // Last row ID from previous tile to compare with first row ID in this tile
+
+        // Perform the reduce-value-by-row across partial_sums using an exclusive prefix scan (in place)
+        PartialProduct block_aggregate;
+        BlockScan(temp_storage.scan).ExclusiveScan(
+            partial_sums,                   // Scan input
+            partial_sums,                   // Scan output
+            ReduceByKeyOp(),                // Scan operator
+            block_aggregate,                // Block-wide total (unused)
+            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
+
+        // Scatter an accumulated dot product if it is the head of a valid row (partial_sums now holds the exclusive-scan output)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if (head_flags[ITEM])
+            {
+                d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
+            }
+        }
+    }
+
+
+    /**
+     * Iterate over all tiles of the partial-product array, then let thread 0 write the final running aggregate
+     */
+    __device__ __forceinline__
+    void ProcessTiles()
+    {
+        // Process full tiles
+        int block_offset = 0;
+        while (block_offset <= num_partials - TILE_ITEMS)
+        {
+            ProcessTile<true>(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Process final partial tile (if present)
+        int guarded_items = num_partials - block_offset;
+        if (guarded_items)
+        {
+            ProcessTile<false>(block_offset, guarded_items);
+        }
+
+        // Scatter the final aggregate (this kernel contains only 1 thread block)
+        if (threadIdx.x == 0)
+        {
+            d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
+        }
+    }
+};
+
+
+/******************************************************************************
+ * Kernel entrypoints
+ ******************************************************************************/
+
+
+
+/**
+ * SpMV kernel: every thread block reduces its own contiguous range of sparse COO tuples.
+ */
+template <
+    int                             BLOCK_THREADS,
+    int                             ITEMS_PER_THREAD,
+    typename                        VertexId,
+    typename                        Value>
+__launch_bounds__ (BLOCK_THREADS)
+__global__ void CooKernel(
+    GridEvenShare<int>              even_share,
+    PartialProduct<VertexId, Value> *d_block_partials,
+    VertexId                        *d_rows,
+    VertexId                        *d_columns,
+    Value                           *d_values,
+    Value                           *d_vector,
+    Value                           *d_result)
+{
+    // Per-block worker type for this tuning configuration
+    typedef PersistentBlockSpmv<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> BlockSpmvT;
+
+    // Statically-sized shared memory consumed by the worker
+    __shared__ typename BlockSpmvT::TempStorage smem;
+
+    // Let even-share work out where this block starts and stops
+    even_share.BlockInit();
+    const int segment_begin = even_share.block_offset;
+    const int segment_end   = even_share.block_end;
+
+    // Build the worker over that range
+    BlockSpmvT worker(
+        smem,
+        d_rows, d_columns,
+        d_values, d_vector,
+        d_result,
+        d_block_partials,
+        segment_begin,
+        segment_end);
+
+    // Consume every tile assigned to this block
+    worker.ProcessTiles();
+}
+
+
+/**
+ * Second-pass kernel that combines the per-block boundary partial products left by CooKernel.
+ */
+template <
+    int                             BLOCK_THREADS,
+    int                             ITEMS_PER_THREAD,
+    typename                        VertexId,
+    typename                        Value>
+__launch_bounds__ (BLOCK_THREADS,  1)
+__global__ void CooFinalizeKernel(
+    PartialProduct<VertexId, Value> *d_block_partials,
+    int                             num_partials,
+    Value                           *d_result)
+{
+    // Fix-up worker type for this tuning configuration
+    typedef FinalizeSpmvBlock<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> FixupBlockT;
+
+    // Statically-sized shared memory consumed by the worker
+    __shared__ typename FixupBlockT::TempStorage smem;
+
+    // Build the worker over the whole partial-product array
+    FixupBlockT worker(smem, d_result, d_block_partials, num_partials);
+
+    // Consume every tile of partial products
+    worker.ProcessTiles();
+}
+
+
+
+//---------------------------------------------------------------------
+// Host subroutines
+//---------------------------------------------------------------------
+
+
+/**
+ * Runs the COO SpMV on the device (CooKernel, plus CooFinalizeKernel when more than one thread block is launched),
+ * prints timing, and checks the device result against h_reference.  h_vector must hold coo_graph.col_dim items and
+ * h_reference coo_graph.row_dim items.  CUDA errors are routed through CubDebugExit.
+ */
+template <
+    int                         COO_BLOCK_THREADS,
+    int                         COO_ITEMS_PER_THREAD,
+    int                         COO_SUBSCRIPTION_FACTOR,
+    int                         FINALIZE_BLOCK_THREADS,
+    int                         FINALIZE_ITEMS_PER_THREAD,
+    typename                    VertexId,
+    typename                    Value>
+void TestDevice(
+    CooGraph<VertexId, Value>&  coo_graph,
+    Value*                      h_vector,
+    Value*                      h_reference)
+{
+    typedef PartialProduct<VertexId, Value> PartialProduct;
+    const int COO_TILE_SIZE = COO_BLOCK_THREADS * COO_ITEMS_PER_THREAD;
+
+    // SOA device storage
+    VertexId        *d_rows;             // SOA graph row coordinates
+    VertexId        *d_columns;          // SOA graph col coordinates
+    Value           *d_values;           // SOA graph values
+    Value           *d_vector;           // Vector multiplicand
+    Value           *d_result;           // Output row
+    PartialProduct  *d_block_partials;   // Temporary storage for communicating dot product partials between thread blocks
+
+    // Create SOA version of coo_graph on host
+    int             num_edges   = coo_graph.coo_tuples.size();
+    VertexId        *h_rows     = new VertexId[num_edges];
+    VertexId        *h_columns  = new VertexId[num_edges];
+    Value           *h_values   = new Value[num_edges];
+    for (int i = 0; i < num_edges; i++)
+    {
+        h_rows[i]       = coo_graph.coo_tuples[i].row;
+        h_columns[i]    = coo_graph.coo_tuples[i].col;
+        h_values[i]     = coo_graph.coo_tuples[i].val;
+    }
+
+    // Get CUDA properties
+    Device device_props;
+    CubDebugExit(device_props.Init());
+
+    // Determine launch configuration from kernel properties
+    int coo_sm_occupancy;
+    CubDebugExit(device_props.MaxSmOccupancy(
+        coo_sm_occupancy,
+        CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, VertexId, Value>,
+        COO_BLOCK_THREADS));
+    int max_coo_grid_size   = device_props.sm_count * coo_sm_occupancy * COO_SUBSCRIPTION_FACTOR;
+
+    // Construct an even-share work distribution
+    GridEvenShare<int> even_share(num_edges, max_coo_grid_size, COO_TILE_SIZE);
+    int coo_grid_size  = even_share.grid_size;
+    int num_partials   = coo_grid_size * 2;
+
+    // Allocate COO device arrays
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_rows,            sizeof(VertexId) * num_edges));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_columns,         sizeof(VertexId) * num_edges));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values,          sizeof(Value) * num_edges));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_vector,          sizeof(Value) * coo_graph.col_dim));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_result,          sizeof(Value) * coo_graph.row_dim));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_block_partials,  sizeof(PartialProduct) * num_partials));
+
+    // Copy host arrays to device
+    CubDebugExit(cudaMemcpy(d_rows,     h_rows,     sizeof(VertexId) * num_edges,       cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_columns,  h_columns,  sizeof(VertexId) * num_edges,       cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values,   h_values,   sizeof(Value) * num_edges,          cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_vector,   h_vector,   sizeof(Value) * coo_graph.col_dim,  cudaMemcpyHostToDevice));
+
+    // Bind textures
+    TexVector<Value>::BindTexture(d_vector, coo_graph.col_dim);
+
+    // Print debug info
+    printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n",
+        COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_sm_occupancy);
+    if (coo_grid_size > 1)
+        printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS);
+    fflush(stdout);
+
+    CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
+
+    // Run kernel (always run one iteration without timing)
+    GpuTimer gpu_timer;
+    float elapsed_millis = 0.0;
+    for (int i = 0; i <= g_timing_iterations; i++)
+    {
+        gpu_timer.Start();
+
+        // Initialize output
+        CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));
+
+        // Run the COO kernel (skipped when there is no work: a zero-block launch is an invalid configuration)
+        if (coo_grid_size > 0)
+            CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
+                even_share,
+                d_block_partials,
+                d_rows,
+                d_columns,
+                d_values,
+                d_vector,
+                d_result);
+
+        if (coo_grid_size > 1)
+        {
+            // Run the COO finalize kernel
+            CooFinalizeKernel<FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD><<<1, FINALIZE_BLOCK_THREADS>>>(
+                d_block_partials,
+                num_partials,
+                d_result);
+        }
+
+        gpu_timer.Stop();
+        if (i > 0)
+            elapsed_millis += gpu_timer.ElapsedMillis();
+    }
+
+    // Surface any kernel launch failure (outside the timed region), then force any kernel stdio to screen
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+    fflush(stdout);
+
+    // Display timing
+    if (g_timing_iterations > 0)
+    {
+        float avg_elapsed = elapsed_millis / g_timing_iterations;
+        double total_bytes = (double(sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (double(sizeof(Value)) * coo_graph.row_dim);   // double: an int byte count overflows on large graphs
+        printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n",
+            g_timing_iterations,
+            avg_elapsed,
+            total_bytes / avg_elapsed / 1000.0 / 1000.0,
+            2.0 * num_edges / avg_elapsed / 1000.0 / 1000.0);
+    }
+
+    // Check results
+    int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    TexVector<Value>::UnbindTexture();
+    CubDebugExit(g_allocator.DeviceFree(d_block_partials));
+    CubDebugExit(g_allocator.DeviceFree(d_rows));
+    CubDebugExit(g_allocator.DeviceFree(d_columns));
+    CubDebugExit(g_allocator.DeviceFree(d_values));
+    CubDebugExit(g_allocator.DeviceFree(d_vector));
+    CubDebugExit(g_allocator.DeviceFree(d_result));
+    delete[] h_rows;
+    delete[] h_columns;
+    delete[] h_values;
+}
+
+
+/**
+ * Compute reference answer on CPU: h_reference = coo_graph * h_vector.
+ * h_reference must hold coo_graph.row_dim items; h_vector is indexed by each tuple's column.
+ */
+template <typename VertexId, typename Value>
+void ComputeReference(
+    CooGraph<VertexId, Value>&  coo_graph,
+    Value*                      h_vector,
+    Value*                      h_reference)
+{
+    // Clear the accumulator of every output row
+    for (VertexId i = 0; i < coo_graph.row_dim; i++)
+        h_reference[i] = 0.0;
+
+    // Tuples are indexed with size_t so the counter matches the type returned by size()
+    // (no signed/unsigned comparison, and no dependence on how many tuples VertexId can count)
+    for (size_t i = 0; i < coo_graph.coo_tuples.size(); i++)
+    {
+        h_reference[coo_graph.coo_tuples[i].row] += coo_graph.coo_tuples[i].val * h_vector[coo_graph.coo_tuples[i].col];
+    }
+}
+
+
+/**
+ * Fill the first col_dim items of vector with the constant 1.0
+ * (a non-positive col_dim leaves vector untouched)
+ */
+template <typename Value>
+void AssignVectorValues(Value *vector, int col_dim)
+{
+    int next = 0;
+    while (next < col_dim)
+        vector[next++] = 1.0;
+}
+
+
+/**
+ * Main: builds the requested COO graph, computes a CPU reference product, and validates the GPU SpMV against it
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("i", g_timing_iterations);
+
+    // Print usage (the iteration count is parsed from the "i" key above)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s\n [--device=<device-id>] [--v] [--i=<test iterations>] [--grid-size=<grid-size>]\n"
+            "\t--type=wheel --spokes=<spokes>\n"
+            "\t--type=grid2d --width=<width> [--no-self-loops]\n"
+            "\t--type=grid3d --width=<width> [--no-self-loops]\n"
+            "\t--type=market --file=<file>\n"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get graph type
+    string type;
+    args.GetCmdLineArgument("type", type);
+
+    // Generate graph structure
+    // (size arguments start at 0 so that an omitted option never leaves them indeterminate)
+    CpuTimer timer;
+    timer.Start();
+    CooGraph<VertexId, Value> coo_graph;
+    if (type == string("grid2d"))
+    {
+        VertexId width = 0;
+        args.GetCmdLineArgument("width", width);
+        bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
+        printf("Generating %s grid2d width(%d)... ", (self_loops) ? "5-pt" : "4-pt", width); fflush(stdout);
+        if (coo_graph.InitGrid2d(width, self_loops)) exit(1);
+    } else if (type == string("grid3d"))
+    {
+        VertexId width = 0;
+        args.GetCmdLineArgument("width", width);
+        bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
+        printf("Generating %s grid3d width(%d)... ", (self_loops) ? "7-pt" : "6-pt", width); fflush(stdout);
+        if (coo_graph.InitGrid3d(width, self_loops)) exit(1);
+    }
+    else if (type == string("wheel"))
+    {
+        VertexId spokes = 0;
+        args.GetCmdLineArgument("spokes", spokes);
+        printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout);
+        if (coo_graph.InitWheel(spokes)) exit(1);
+    }
+    else if (type == string("market"))
+    {
+        string filename;
+        args.GetCmdLineArgument("file", filename);
+        printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout);
+        if (coo_graph.InitMarket(filename)) exit(1);
+    }
+    else
+    {
+        printf("Unsupported graph type\n");
+        exit(1);
+    }
+    timer.Stop();
+    printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n",
+        timer.ElapsedMillis() / 1000.0,
+        (int) coo_graph.coo_tuples.size(),     // size() is not an int; %d requires one
+        coo_graph.row_dim,
+        coo_graph.col_dim);
+    fflush(stdout);
+
+    if (g_verbose)
+    {
+        cout << coo_graph << "\n";
+    }
+
+    // Create vector
+    Value *h_vector = new Value[coo_graph.col_dim];
+    AssignVectorValues(h_vector, coo_graph.col_dim);
+    if (g_verbose)
+    {
+        printf("Vector[%d]: ", coo_graph.col_dim);
+        DisplayResults(h_vector, coo_graph.col_dim);
+        printf("\n\n");
+    }
+
+    // Compute reference answer
+    Value *h_reference = new Value[coo_graph.row_dim];
+    ComputeReference(coo_graph, h_vector, h_reference);
+    if (g_verbose)
+    {
+        printf("Results[%d]: ", coo_graph.row_dim);
+        DisplayResults(h_reference, coo_graph.row_dim);
+        printf("\n\n");
+    }
+
+    // Parameterization for SM35
+    enum
+    {
+        COO_BLOCK_THREADS           = 64,
+        COO_ITEMS_PER_THREAD        = 10,
+        COO_SUBSCRIPTION_FACTOR     = 4,
+        FINALIZE_BLOCK_THREADS      = 256,
+        FINALIZE_ITEMS_PER_THREAD   = 4,
+    };
+
+    // Run GPU version
+    TestDevice<
+        COO_BLOCK_THREADS,
+        COO_ITEMS_PER_THREAD,
+        COO_SUBSCRIPTION_FACTOR,
+        FINALIZE_BLOCK_THREADS,
+        FINALIZE_ITEMS_PER_THREAD>(coo_graph, h_vector, h_reference);
+
+    // Cleanup
+    delete[] h_vector;
+    delete[] h_reference;
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/experimental/defunct/test_device_seg_reduce.cu b/external/cub/experimental/defunct/test_device_seg_reduce.cu
new file mode 100644
index 00000000000..20ef4764389
--- /dev/null
+++ b/external/cub/experimental/defunct/test_device_seg_reduce.cu
@@ -0,0 +1,2142 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * An implementation of segmented reduction using a load-balanced parallelization
+ * strategy based on the MergePath decision path.
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iterator>
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <stdio.h>
+
+#include <cub/cub.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+using namespace std;
+
+
+/******************************************************************************
+ * Globals, constants, and typedefs
+ ******************************************************************************/
+
+bool                    g_verbose           = false;
+int                     g_timing_iterations = 1;
+CachingDeviceAllocator  g_allocator(true);
+
+
+/******************************************************************************
+ * Utility routines
+ ******************************************************************************/
+
+
+/**
+ * A pair of index offsets: one into a sequence "a" (a_idx) and one into a sequence "b" (b_idx)
+ */
+template <typename OffsetT>
+struct IndexPair
+{
+    OffsetT a_idx;      // Offset into the "a" sequence
+    OffsetT b_idx;      // Offset into the "b" sequence
+};
+
+
+/**
+ * Block-cooperative variant of the merge-path search: finds where the specified diagonal crosses the merge decision path of a and b.
+ * Uses __syncthreads_count() inside its loop, so every thread of the block must call it together.
+ */
+template <
+    int                 BLOCK_THREADS,
+    typename            IteratorA,
+    typename            IteratorB,
+    typename            OffsetT>
+__device__ __forceinline__ void ParallelMergePathSearch(
+    OffsetT             diagonal,
+    IteratorA           a,
+    IteratorB           b,
+    IndexPair<OffsetT>  begin,          // Begin offsets into a and b
+    IndexPair<OffsetT>  end,            // End offsets into a and b
+    IndexPair<OffsetT>  &intersection)  // [out] Intersection offsets into a and b
+{
+    OffsetT a_split_min = CUB_MAX(diagonal - end.b_idx, begin.a_idx);
+    OffsetT a_split_max = CUB_MIN(diagonal, end.a_idx);
+    // Each pass probes the candidate range [a_split_min, a_split_max) at one pivot per thread
+    while (a_split_min < a_split_max)
+    {
+        OffsetT a_distance       = a_split_max - a_split_min;
+        OffsetT a_slice          = (a_distance + BLOCK_THREADS - 1) >> Log2<BLOCK_THREADS>::VALUE;   // Spacing between pivots; equals ceil(a_distance / BLOCK_THREADS) assuming BLOCK_THREADS is a power of two
+        OffsetT a_split_pivot    = CUB_MIN(a_split_min + (threadIdx.x * a_slice), end.a_idx - 1);    // This thread's pivot, clamped to the last item before end.a_idx
+
+        int move_up = (a[a_split_pivot] <= b[diagonal - a_split_pivot - 1]);    // 1 when a's pivot item is <= the paired item of b on this diagonal
+        int num_up = __syncthreads_count(move_up);                              // Block-wide barrier returning how many threads voted 1
+/* Disabled debug trace (a_begin and a_end are not declared in this function):
+        _CubLog("a_split_min(%d), a_split_max(%d) a_distance(%d), a_slice(%d), a_split_pivot(%d), move_up(%d), num_up(%d), a_begin(%d), a_end(%d)\n",
+            a_split_min, a_split_max, a_distance, a_slice, a_split_pivot, move_up, num_up, a_begin, a_end);
+*/
+        a_split_max = CUB_MIN(num_up * a_slice, end.a_idx);                 // NOTE(review): pivots were offset by the old a_split_min, but this bound is not -- confirm intended
+        a_split_min = CUB_MAX(a_split_max - a_slice, begin.a_idx) + 1;
+    }
+
+    intersection.a_idx = CUB_MIN(a_split_min, end.a_idx);
+    intersection.b_idx = CUB_MIN(diagonal - a_split_min, end.b_idx);
+}
+
+/**
+ * Sequential binary search for the point where the specified diagonal crosses
+ * the merge decision path of a and b; the crossing's offsets into a and b are
+ * written to intersection (each clamped to the matching end offset).
+ */
+template <
+    typename            IteratorA,
+    typename            IteratorB,
+    typename            OffsetT>
+__device__ __forceinline__ void MergePathSearch(
+    OffsetT             diagonal,
+    IteratorA           a,
+    IteratorB           b,
+    IndexPair<OffsetT>  begin,          // Begin offsets into a and b
+    IndexPair<OffsetT>  end,            // End offsets into a and b
+    IndexPair<OffsetT>  &intersection)  // [out] Intersection offsets into a and b
+{
+    // Candidate range [lo, hi) of offsets into a that can lie on this diagonal
+    OffsetT lo = CUB_MAX(diagonal - end.b_idx, begin.a_idx);
+    OffsetT hi = CUB_MIN(diagonal, end.a_idx);
+
+    while (lo < hi)
+    {
+        const OffsetT mid = (lo + hi) >> 1;
+        // When a[mid] does not exceed its partner in b, the split moves up a (down b); otherwise up b (down a)
+        const bool advance_a = (a[mid] <= b[diagonal - mid - 1]);
+        if (advance_a)
+            lo = mid + 1;
+        else
+            hi = mid;
+    }
+
+    // lo is the split offset into a; the rest of the diagonal falls in b
+    const OffsetT b_split = diagonal - lo;
+    intersection.a_idx = CUB_MIN(lo, end.a_idx);
+    intersection.b_idx = CUB_MIN(b_split, end.b_idx);
+}
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockSegReduceRegion: re-exports each template argument as a nested constant
+ */
+template <
+    int                     _BLOCK_THREADS,             ///< Threads per thread block
+    int                     _ITEMS_PER_THREAD,          ///< Items per thread (per tile of input)
+    bool                    _USE_SMEM_SEGMENT_CACHE,    ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
+    bool                    _USE_SMEM_VALUE_CACHE,      ///< Whether or not to cache incoming values in shared memory before reducing each tile
+    CacheLoadModifier       _LOAD_MODIFIER_SEGMENTS,    ///< Cache load modifier for reading segment offsets
+    CacheLoadModifier       _LOAD_MODIFIER_VALUES,      ///< Cache load modifier for reading values
+    BlockReduceAlgorithm    _REDUCE_ALGORITHM,          ///< The BlockReduce algorithm to use
+    BlockScanAlgorithm      _SCAN_ALGORITHM>            ///< The BlockScan algorithm to use
+struct BlockSegReduceRegionPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+        USE_SMEM_SEGMENT_CACHE  = _USE_SMEM_SEGMENT_CACHE,      ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
+        USE_SMEM_VALUE_CACHE    = _USE_SMEM_VALUE_CACHE,        ///< Whether or not to cache incoming values in shared memory before reducing each tile
+    };
+
+    static const CacheLoadModifier      LOAD_MODIFIER_SEGMENTS  = _LOAD_MODIFIER_SEGMENTS;  ///< Cache load modifier for reading segment offsets
+    static const CacheLoadModifier      LOAD_MODIFIER_VALUES    = _LOAD_MODIFIER_VALUES;    ///< Cache load modifier for reading values
+    static const BlockReduceAlgorithm   REDUCE_ALGORITHM        = _REDUCE_ALGORITHM;        ///< The BlockReduce algorithm to use
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;          ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Persistent thread block types
+ ******************************************************************************/
+
+/**
+ * \brief BlockSegReduceRegion implements a stateful abstraction of CUDA thread blocks for participating in device-wide segmented reduction.
+ */
+template <
+    typename BlockSegReduceRegionPolicy,    ///< Parameterized BlockSegReduceRegionPolicy tuning policy
+    typename SegmentOffsetIterator,         ///< Random-access input iterator type for reading segment end-offsets
+    typename ValueIterator,                 ///< Random-access input iterator type for reading values
+    typename OutputIteratorT,               ///< Random-access output iterator type for writing segment reductions
+    typename ReductionOp,                   ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename OffsetT>                       ///< Signed integer type for global offsets
+struct BlockSegReduceRegion
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockSegReduceRegionPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockSegReduceRegionPolicy::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,                     /// Number of work items to be processed per tile
+
+        USE_SMEM_SEGMENT_CACHE  = BlockSegReduceRegionPolicy::USE_SMEM_SEGMENT_CACHE,      ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
+        USE_SMEM_VALUE_CACHE    = BlockSegReduceRegionPolicy::USE_SMEM_VALUE_CACHE,        ///< Whether or not to cache incoming upcoming values in shared memory before reducing each tile
+
+        SMEM_SEGMENT_CACHE_ITEMS    = USE_SMEM_SEGMENT_CACHE ? TILE_ITEMS : 1,
+        SMEM_VALUE_CACHE_ITEMS      = USE_SMEM_VALUE_CACHE ? TILE_ITEMS : 1,
+    };
+
+    // Segment offset type
+    typedef typename std::iterator_traits<SegmentOffsetIterator>::value_type SegmentOffset;
+
+    // Value type
+    typedef typename std::iterator_traits<ValueIterator>::value_type Value;
+
+    // Counting iterator type (yields values of the SegmentOffset type declared above)
+    typedef CountingInputIterator<SegmentOffset, OffsetT> CountingIterator;
+
+    // Segment offsets iterator wrapper type
+    typedef typename If<(IsPointer<SegmentOffsetIterator>::VALUE),
+            CacheModifiedInputIterator<BlockSegReduceRegionPolicy::LOAD_MODIFIER_SEGMENTS, SegmentOffset, OffsetT>,   // Wrap the native input pointer with CacheModifiedInputIterator
+            SegmentOffsetIterator>::Type                                                                            // Directly use the supplied input iterator type
+        WrappedSegmentOffsetIterator;
+
+    // Values iterator wrapper type
+    typedef typename If<(IsPointer<ValueIterator>::VALUE),
+            CacheModifiedInputIterator<BlockSegReduceRegionPolicy::LOAD_MODIFIER_VALUES, Value, OffsetT>,        // Wrap the native input pointer with CacheModifiedInputIterator
+            ValueIterator>::Type                                                                                // Directly use the supplied input iterator type
+        WrappedValueIterator;
+
+    // Tail flag type for marking segment discontinuities
+    typedef int TailFlag;
+
+    // Reduce-by-key data type tuple (segment-ID, value)
+    typedef KeyValuePair<OffsetT, Value> KeyValuePair;
+
+    // Index pair data type
+    typedef IndexPair<OffsetT> IndexPair;
+
+    // BlockScan scan operator for reduction-by-segment
+    typedef ReduceByKeyOp<ReductionOp> ReduceByKeyOp;
+
+    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+    typedef RunningBlockPrefixCallbackOp<
+            KeyValuePair,
+            ReduceByKeyOp>
+        RunningPrefixCallbackOp;
+
+    // Parameterized BlockShift type for exchanging index pairs
+    typedef BlockShift<
+            IndexPair,
+            BLOCK_THREADS>
+        BlockShift;
+
+    // Parameterized BlockReduce type for block-wide reduction
+    typedef BlockReduce<
+            Value,
+            BLOCK_THREADS,
+            BlockSegReduceRegionPolicy::REDUCE_ALGORITHM>
+        BlockReduce;
+
+    // Parameterized BlockScan type for block-wide reduce-value-by-key
+    typedef BlockScan<
+            KeyValuePair,
+            BLOCK_THREADS,
+            BlockSegReduceRegionPolicy::SCAN_ALGORITHM>
+        BlockScan;
+
+    // Shared memory type for this thread block
+    struct _TempStorage
+    {
+        union
+        {
+            // Smem needed for BlockScan
+            typename BlockScan::TempStorage scan;
+
+            // Smem needed for BlockReduce
+            typename BlockReduce::TempStorage reduce;
+
+            struct
+            {
+                // Smem needed for communicating start/end indices between threads for a given work tile
+                typename BlockShift::TempStorage shift;
+
+                // Smem needed for caching segment end-offsets
+                SegmentOffset cached_segment_end_offsets[SMEM_SEGMENT_CACHE_ITEMS + 1];
+            };
+
+            // Smem needed for caching values
+            Value cached_values[SMEM_VALUE_CACHE_ITEMS];
+        };
+
+        IndexPair block_region_idx[2];      // The starting [0] and ending [1] pairs of segment and value indices for the thread block's region
+
+        // The first partial reduction tuple scattered by this thread block
+        KeyValuePair first_tuple;
+    };
+
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage                    &temp_storage;          ///< Reference to shared storage
+    WrappedSegmentOffsetIterator    d_segment_end_offsets;  ///< A sequence of \p num_segments segment end-offsets
+    WrappedValueIterator            d_values;               ///< A sequence of \p num_values data to reduce
+    OutputIteratorT                  d_output;               ///< A sequence of \p num_segments segment totals
+    CountingIterator                d_value_offsets;        ///< A sequence of \p num_values value-offsets
+    IndexPair                       *d_block_idx;
+    OffsetT                         num_values;             ///< Total number of values to reduce
+    OffsetT                         num_segments;           ///< Number of segments being reduced
+    Value                           identity;               ///< Identity value (for zero-length segments)
+    ReductionOp                     reduction_op;           ///< Reduction operator
+    ReduceByKeyOp                   scan_op;                ///< Reduce-by-key scan operator
+    RunningPrefixCallbackOp         prefix_op;              ///< Stateful running total for block-wide prefix scan of partial reduction tuples
+
+
+    //---------------------------------------------------------------------
+    // Operations
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor.  Members are initialized in their declaration order; scan_op wraps reduction_op and prefix_op wraps scan_op.
+     */
+    __device__ __forceinline__
+    BlockSegReduceRegion(
+        TempStorage             &temp_storage,          ///< Reference to shared storage
+        SegmentOffsetIterator   d_segment_end_offsets,  ///< A sequence of \p num_segments segment end-offsets
+        ValueIterator           d_values,               ///< A sequence of \p num_values values
+        OutputIteratorT          d_output,               ///< A sequence of \p num_segments segment totals
+        IndexPair               *d_block_idx,
+        OffsetT                 num_values,             ///< Number of values to reduce
+        OffsetT                 num_segments,           ///< Number of segments being reduced
+        Value                   identity,               ///< Identity value (for zero-length segments)
+        ReductionOp             reduction_op)           ///< Reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_segment_end_offsets(d_segment_end_offsets),
+        d_values(d_values),
+        d_output(d_output),
+        d_value_offsets(0),
+        d_block_idx(d_block_idx),
+        num_values(num_values),
+        num_segments(num_segments),
+        identity(identity),
+        reduction_op(reduction_op),
+        scan_op(reduction_op),
+        prefix_op(scan_op)
+    {}
+
+
+    /**
+     * Fast-path single-segment tile reduction: the block reduces the tile's
+     * values and thread-0 folds that aggregate (keyed by block_idx.a_idx) into
+     * the running total.  Contains a block-wide barrier, so the whole block must call it.
+     */
+    __device__ __forceinline__ void SingleSegmentTile(
+        IndexPair next_tile_idx,
+        IndexPair block_idx)
+    {
+        // Number of values between this tile's start and the next tile's start
+        OffsetT num_tile_values = next_tile_idx.b_idx - block_idx.b_idx;
+
+        // Striped load of the tile; out-of-bounds slots receive identity
+        Value items[ITEMS_PER_THREAD];
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values + block_idx.b_idx, items, num_tile_values, identity);
+
+        // Barrier for smem reuse
+        __syncthreads();
+
+        // Block-wide reduction of the tile's values
+        KeyValuePair aggregate;
+        aggregate.value = BlockReduce(temp_storage.reduce).Reduce(items, reduction_op);
+        aggregate.key   = block_idx.a_idx;
+
+        // The running total is updated in thread-0 only
+        if (threadIdx.x == 0)
+            prefix_op.running_total = scan_op(prefix_op.running_total, aggregate);
+    }
+
+    /**
+     * Fast-path empty-segment tile reduction.  Thread-0 emits the running
+     * segment total as its first item, every other item of the tile is
+     * identity, and the tile is stored to d_output starting at
+     * block_idx.a_idx.  Thread-0 also restarts the running prefix
+     * (value = identity, key = next_tile_idx.a_idx).
+     */
+    __device__ __forceinline__ void EmptySegmentsTile(
+        IndexPair next_tile_idx,
+        IndexPair block_idx)
+    {
+        // The running prefix is read and updated by thread-0 only
+        const bool is_thread_zero = (threadIdx.x == 0);
+
+        // Item 0: thread-0 emits the running segment total, all other threads emit identity
+        Value outputs[ITEMS_PER_THREAD];
+        outputs[0] = is_thread_zero ? prefix_op.running_total.value : identity;
+
+        // Remaining items are identity in every thread
+        #pragma unroll
+        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            outputs[ITEM] = identity;
+
+        // Update the running prefix
+        if (is_thread_zero)
+        {
+            prefix_op.running_total.value = identity;
+            prefix_op.running_total.key   = next_tile_idx.a_idx;
+        }
+
+        // Store reductions (guarded by the number of segments in this tile)
+        OffsetT num_tile_segments = next_tile_idx.a_idx - block_idx.a_idx;
+        StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_output + block_idx.a_idx, outputs, num_tile_segments);
+    }
+
+
+    /**
+     * Multi-segment tile reduction.
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void MultiSegmentTile(
+        IndexPair block_idx,            // Tile begin indices (a_idx: segments, b_idx: values)
+        IndexPair thread_idx,           // This thread's begin indices (advanced locally as items are consumed)
+        IndexPair next_thread_idx,      // This thread's end indices (only consulted when !FULL_TILE)
+        IndexPair next_tile_idx)        // Tile end indices (only b_idx is read, by the smem value-cache path)
+    {
+        IndexPair local_thread_idx;     // Thread begin indices relative to the tile start
+        local_thread_idx.a_idx = thread_idx.a_idx - block_idx.a_idx;
+        local_thread_idx.b_idx = thread_idx.b_idx - block_idx.b_idx;
+
+        // Check if first segment end-offset is in range (always true for full tiles)
+        bool valid_segment = FULL_TILE || (thread_idx.a_idx < next_thread_idx.a_idx);
+
+        // Check if first value offset is in range (always true for full tiles)
+        bool valid_value = FULL_TILE || (thread_idx.b_idx < next_thread_idx.b_idx);
+
+        // Load first segment end-offset (from the smem cache or from global; -1 when out of range)
+        OffsetT segment_end_offset = (valid_segment) ?
+            (USE_SMEM_SEGMENT_CACHE)?
+                temp_storage.cached_segment_end_offsets[local_thread_idx.a_idx] :
+                d_segment_end_offsets[thread_idx.a_idx] :
+            -1;
+
+        OffsetT segment_ids[ITEMS_PER_THREAD];      // Segment index consumed at each item (-1 if none)
+        OffsetT value_offsets[ITEMS_PER_THREAD];    // Value offset consumed at each item (-1 if none)
+
+        KeyValuePair first_partial;                 // Keyed by the segment this thread starts in
+        first_partial.key    = thread_idx.a_idx;
+        first_partial.value  = identity;
+
+        // Get segment IDs and gather-offsets for values: each item consumes either one segment index or one value index
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            segment_ids[ITEM]   = -1;
+            value_offsets[ITEM] = -1;
+
+            // Whether or not we slide (a) right along the segment path or (b) down the value path
+            if (valid_segment && (!valid_value || (segment_end_offset <= thread_idx.b_idx)))
+            {
+                // Consume this segment index
+                segment_ids[ITEM] = thread_idx.a_idx;
+                thread_idx.a_idx++;
+                local_thread_idx.a_idx++;
+
+                valid_segment = FULL_TILE || (thread_idx.a_idx < next_thread_idx.a_idx);
+
+                // Read next segment end-offset (if valid)
+                if (valid_segment)
+                {
+                    if (USE_SMEM_SEGMENT_CACHE)
+                        segment_end_offset = temp_storage.cached_segment_end_offsets[local_thread_idx.a_idx];
+                    else
+                        segment_end_offset = d_segment_end_offsets[thread_idx.a_idx];
+                }
+            }
+            else if (valid_value)
+            {
+                // Consume this value index
+                value_offsets[ITEM] = thread_idx.b_idx;
+                thread_idx.b_idx++;
+                local_thread_idx.b_idx++;
+
+                valid_value = FULL_TILE || (thread_idx.b_idx < next_thread_idx.b_idx);
+            }
+        }
+
+        // Load values (items that did not consume a value offset receive identity)
+        Value values[ITEMS_PER_THREAD];
+
+        if (USE_SMEM_VALUE_CACHE)
+        {
+            // Barrier for smem reuse
+            __syncthreads();
+
+            OffsetT tile_values = next_tile_idx.b_idx - block_idx.b_idx;    // Number of values covered by this tile
+
+            // Load a tile's worth of values (using identity for out-of-bounds items)
+            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values + block_idx.b_idx, values, tile_values, identity);
+
+            // Store to shared
+            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, temp_storage.cached_values, values, tile_values);
+
+            // Barrier so the cached values are visible to all threads before the gather below
+            __syncthreads();
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                values[ITEM] = (value_offsets[ITEM] == -1) ?
+                    identity :
+                    temp_storage.cached_values[value_offsets[ITEM] - block_idx.b_idx];
+            }
+        }
+        else
+        {
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                values[ITEM] = (value_offsets[ITEM] == -1) ?
+                    identity :
+                    d_values[value_offsets[ITEM]];
+            }
+        }
+
+        // Reduce within thread segments: walk the items in order, accumulating values into running_total
+        KeyValuePair running_total = first_partial;
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (segment_ids[ITEM] != -1)
+            {
+                // Consume this segment index: write the accumulated value out as this segment's result
+                d_output[segment_ids[ITEM]] = running_total.value;
+
+//                _CubLog("Updating segment %d with value %lld\n", segment_ids[ITEM], running_total.value)
+
+                if (first_partial.key == segment_ids[ITEM])
+                    first_partial.value = running_total.value;     // Remember what was written for the segment this thread started in
+
+                running_total.key    = segment_ids[ITEM];
+                running_total.value  = identity;                    // Restart accumulation after the segment boundary
+            }
+
+            running_total.value = reduction_op(running_total.value, values[ITEM]);
+        }
+/* NOTE(review): disabled code. With it commented out, nothing after the loop above reads
+   first_partial or running_total, so the per-thread carry is not propagated - confirm intent.
+        // Barrier for smem reuse
+        __syncthreads();
+
+        // Use prefix scan to reduce values by segment-id.  The segment-reductions end up in items flagged as segment-tails.
+        KeyValuePair block_aggregate;
+        BlockScan(temp_storage.scan).InclusiveScan(
+            pairs,                          // Scan input
+            pairs,                          // Scan output
+            scan_op,                        // Scan operator
+            block_aggregate,                // Block-wide total (unused)
+            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
+*/
+
+/* NOTE(review): disabled earlier formulation (tail flags + BlockScan), kept for reference.
+        // Check if first segment end-offset is in range
+        bool valid_segment = (thread_idx.a_idx < next_thread_idx.a_idx);
+
+        // Check if first value offset is in range
+        bool valid_value = (thread_idx.b_idx < next_thread_idx.b_idx);
+
+        // Load first segment end-offset
+        OffsetT segment_end_offset = (valid_segment) ?
+            d_segment_end_offsets[thread_idx.a_idx] :
+            num_values;                                                     // Out of range (the last segment end-offset is one-past the last value offset)
+
+        // Load first value offset
+        OffsetT value_offset = (valid_value) ?
+            d_value_offsets[thread_idx.b_idx] :
+            num_values;                                                     // Out of range (one-past the last value offset)
+
+        // Assemble segment-demarcating tail flags and partial reduction tuples
+        TailFlag        tail_flags[ITEMS_PER_THREAD];
+        KeyValuePair    partial_reductions[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Default tuple and flag values
+            partial_reductions[ITEM].key    = thread_idx.a_idx;
+            partial_reductions[ITEM].value  = identity;
+            tail_flags[ITEM]                = 0;
+
+            // Whether or not we slide (a) right along the segment path or (b) down the value path
+            if (valid_segment && (!valid_value || (segment_end_offset <= value_offset)))
+            {
+                // Consume this segment index
+
+                // Set tail flag noting the end of the segment
+                tail_flags[ITEM] = 1;
+
+                // Increment segment index
+                thread_idx.a_idx++;
+
+                // Read next segment end-offset (if valid)
+                if ((valid_segment = (thread_idx.a_idx < next_thread_idx.a_idx)))
+                    segment_end_offset = d_segment_end_offsets[thread_idx.a_idx];
+            }
+            else if (valid_value)
+            {
+                // Consume this value index
+
+                // Update the tuple's value with the value at this index.
+                partial_reductions[ITEM].value = d_values[value_offset];
+
+                // Increment value index
+                thread_idx.b_idx++;
+
+                // Read next value offset (if valid)
+                if ((valid_value = (thread_idx.b_idx < next_thread_idx.b_idx)))
+                    value_offset = d_value_offsets[thread_idx.b_idx];
+            }
+        }
+
+        // Use prefix scan to reduce values by segment-id.  The segment-reductions end up in items flagged as segment-tails.
+        KeyValuePair block_aggregate;
+        BlockScan(temp_storage.scan).InclusiveScan(
+            partial_reductions,             // Scan input
+            partial_reductions,             // Scan output
+            scan_op,                        // Scan operator
+            block_aggregate,                // Block-wide total (unused)
+            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
+
+        // The first segment index for this region (hoist?)
+        OffsetT first_segment_idx = temp_storage.block_idx.a_idx[0];
+
+        // Scatter an accumulated reduction if it is the head of a valid segment
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if (tail_flags[ITEM])
+            {
+                OffsetT segment_idx = partial_reductions[ITEM].key;
+                Value   value       = partial_reductions[ITEM].value;
+
+                // Write value reduction to corresponding segment id
+                d_output[segment_idx] = value;
+
+                // Save off the first value product that this thread block will scatter
+                if (segment_idx == first_segment_idx)
+                {
+                    temp_storage.first_tuple.value = value;
+                }
+            }
+        }
+*/
+    }
+
+
+
+    /**
+     * Have the thread block process the specified region of the MergePath decision path
+     */
+    __device__ __forceinline__ void ProcessRegion(
+        OffsetT         block_diagonal,         // [In] Diagonal at which this block's region begins
+        OffsetT         next_block_diagonal,    // [In] Diagonal at which this block's region ends (exclusive)
+        KeyValuePair    &first_tuple,       // [Out] Valid in thread-0
+        KeyValuePair    &last_tuple)        // [Out] Valid in thread-0
+    {
+        // Thread block initialization: thread-0 fetches the region's begin indices, thread-1 its end indices
+        if (threadIdx.x < 2)
+        {
+            // Retrieve block starting and ending indices
+            IndexPair block_idx = {0, 0};
+            if (gridDim.x > 1)
+            {
+                block_idx = d_block_idx[blockIdx.x + threadIdx.x];
+            }
+            else if (threadIdx.x > 0)
+            {
+                block_idx.a_idx = num_segments;     // Single-block grid: the region spans [{0, 0}, {num_segments, num_values})
+                block_idx.b_idx = num_values;
+            }
+
+            // Share block starting and ending indices
+            temp_storage.block_region_idx[threadIdx.x] = block_idx;
+
+            // Initialize the block's running prefix
+            if (threadIdx.x == 0)
+            {
+                prefix_op.running_total.key    = block_idx.a_idx;
+                prefix_op.running_total.value  = identity;
+
+                // Initialize the "first scattered partial reduction tuple" to the prefix tuple (in case we don't actually scatter one)
+                temp_storage.first_tuple = prefix_op.running_total;
+            }
+        }
+
+        // Ensure coherence of region indices
+        __syncthreads();
+
+        // Read block's starting indices
+        IndexPair block_idx = temp_storage.block_region_idx[0];
+
+        // Have the thread block iterate over the region, one tile (TILE_ITEMS diagonals) per iteration
+        #pragma unroll 1
+        while (block_diagonal < next_block_diagonal)
+        {
+            // Read block's ending indices (hoist?)
+            IndexPair next_block_idx = temp_storage.block_region_idx[1];
+
+            // Clamp the per-thread search range to within one work-tile of block's current indices
+            IndexPair next_tile_idx;
+            next_tile_idx.a_idx = CUB_MIN(next_block_idx.a_idx, block_idx.a_idx + TILE_ITEMS);
+            next_tile_idx.b_idx = CUB_MIN(next_block_idx.b_idx, block_idx.b_idx + TILE_ITEMS);
+
+            // Have each thread search for the end-indices of its subranges within the segment and value inputs
+            IndexPair next_thread_idx;
+            if (USE_SMEM_SEGMENT_CACHE)
+            {
+                // Search in smem cache.  NOTE(review): this local shadows the outer num_segments used elsewhere in this method
+                OffsetT num_segments = next_tile_idx.a_idx - block_idx.a_idx;
+
+                // Load global (out-of-range slots are filled with num_values)
+                SegmentOffset segment_offsets[ITEMS_PER_THREAD];
+                LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_segment_end_offsets + block_idx.a_idx, segment_offsets, num_segments, num_values);
+
+                // Store to shared
+                StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, temp_storage.cached_segment_end_offsets, segment_offsets);
+
+                __syncthreads();
+
+                OffsetT next_thread_diagonal = block_diagonal + ((threadIdx.x + 1) * ITEMS_PER_THREAD);
+                // The cache pointer is biased by -block_idx.a_idx, presumably so the search can use global segment indices - confirm against MergePathSearch
+                MergePathSearch(
+                    next_thread_diagonal,                       // Next thread diagonal
+                    temp_storage.cached_segment_end_offsets - block_idx.a_idx,                      // A (segment end-offsets)
+                    d_value_offsets,                            // B (value offsets)
+                    block_idx,                                  // Start indices into A and B
+                    next_tile_idx,                              // End indices into A and B
+                    next_thread_idx);                           // [out] diagonal intersection indices into A and B
+            }
+            else
+            {
+                // Search in global
+
+                OffsetT next_thread_diagonal = block_diagonal + ((threadIdx.x + 1) * ITEMS_PER_THREAD);
+
+                MergePathSearch(
+                    next_thread_diagonal,                       // Next thread diagonal
+                    d_segment_end_offsets,                      // A (segment end-offsets)
+                    d_value_offsets,                            // B (value offsets)
+                    block_idx,                                  // Start indices into A and B
+                    next_tile_idx,                              // End indices into A and B
+                    next_thread_idx);                           // [out] diagonal intersection indices into A and B
+            }
+
+            // Share thread end-indices to get thread begin-indices and tile end-indices
+            IndexPair thread_idx;
+
+            BlockShift(temp_storage.shift).Up(
+                next_thread_idx,    // Input item
+                thread_idx,         // [out] Output item
+                block_idx,          // Prefix item to be provided to <em>thread</em><sub>0</sub>
+                next_tile_idx);     // [out] Suffix item shifted out by the <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub> to be provided to all threads
+            // NOTE(review): the single-segment / empty-segment fast paths below are disabled; every tile takes the multi-segment path
+//            if (block_idx.a_idx == next_tile_idx.a_idx)
+//            {
+//                // There are no segment end-offsets in this tile.  Perform a
+//                // simple block-wide reduction and accumulate the result into
+//                // the running total.
+//                SingleSegmentTile(next_tile_idx, block_idx);
+//            }
+//          else if (block_idx.b_idx == next_tile_idx.b_idx)
+//            {
+//                // There are no values in this tile (only empty segments).
+//                EmptySegmentsTile(next_tile_idx.a_idx, block_idx.a_idx);
+//            }
+//            else
+            if ((next_tile_idx.a_idx < num_segments) && (next_tile_idx.b_idx < num_values))
+            {
+                // Merge the tile's segment and value indices (full tile)
+                MultiSegmentTile<true>(block_idx, thread_idx, next_thread_idx, next_tile_idx);
+            }
+            else
+            {
+                // Merge the tile's segment and value indices (partially full tile)
+                MultiSegmentTile<false>(block_idx, thread_idx, next_thread_idx, next_tile_idx);
+            }
+
+            // Advance the block's indices in preparation for the next tile
+            block_idx = next_tile_idx;
+
+            // Advance to the next region in the decision path
+            block_diagonal += TILE_ITEMS;
+
+            // Barrier for smem reuse
+            __syncthreads();
+        }
+
+        // Get first and last tuples for the region (only thread-0 writes the outputs)
+        if (threadIdx.x == 0)
+        {
+            first_tuple = temp_storage.first_tuple;
+            last_tuple = prefix_op.running_total;
+        }
+
+    }
+
+
+};
+
+
+
+
+
+
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for BlockSegReduceRegionByKey
+ */
+template <
+    int                     _BLOCK_THREADS,             ///< Threads per thread block
+    int                     _ITEMS_PER_THREAD,          ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm      _LOAD_ALGORITHM,            ///< The BlockLoad algorithm to use
+    bool                    _LOAD_WARP_TIME_SLICING,    ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+    CacheLoadModifier       _LOAD_MODIFIER,             ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm      _SCAN_ALGORITHM>            ///< The BlockScan algorithm to use
+struct BlockSegReduceRegionByKeyPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+        LOAD_WARP_TIME_SLICING  = _LOAD_WARP_TIME_SLICING,      ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Persistent thread block types
+ ******************************************************************************/
+
+/**
+ * \brief BlockSegReduceRegionByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+template <
+    typename    BlockSegReduceRegionByKeyPolicy,        ///< Parameterized BlockSegReduceRegionByKeyPolicy tuning policy
+    typename    InputIteratorT,                         ///< Random-access iterator referencing key-value input tuples
+    typename    OutputIteratorT,                        ///< Random-access iterator referencing segment output totals
+    typename    ReductionOp>                            ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct BlockSegReduceRegionByKey
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockSegReduceRegionByKeyPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockSegReduceRegionByKeyPolicy::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,     // Items consumed by the block per tile
+    };
+
+    // KeyValuePair input type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type KeyValuePair;
+
+    // Signed integer type for global offsets (taken from the tuple's key type)
+    typedef typename KeyValuePair::Key OffsetT;
+
+    // Value type
+    typedef typename KeyValuePair::Value Value;
+
+    // Head flag type
+    typedef int HeadFlag;
+
+    // Input iterator wrapper type for loading KeyValuePair elements through cache
+    typedef CacheModifiedInputIterator<
+            BlockSegReduceRegionByKeyPolicy::LOAD_MODIFIER,
+            KeyValuePair,
+            OffsetT>
+        WrappedInputIteratorT;
+
+    // Parameterized BlockLoad type
+    typedef BlockLoad<
+            WrappedInputIteratorT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            BlockSegReduceRegionByKeyPolicy::LOAD_ALGORITHM,
+            BlockSegReduceRegionByKeyPolicy::LOAD_WARP_TIME_SLICING>
+        BlockLoad;      // NOTE(review): reuses the template's own name inside class scope; some compilers may reject this - confirm
+
+    // BlockScan scan operator for reduction-by-segment
+    typedef ReduceByKeyOp<ReductionOp> ReduceByKeyOp;
+
+    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+    typedef RunningBlockPrefixCallbackOp<
+            KeyValuePair,
+            ReduceByKeyOp>
+        RunningPrefixCallbackOp;
+
+    // Parameterized BlockScan type for block-wide reduce-value-by-key
+    typedef BlockScan<
+            KeyValuePair,
+            BLOCK_THREADS,
+            BlockSegReduceRegionByKeyPolicy::SCAN_ALGORITHM>
+        BlockScan;
+
+    // Parameterized BlockDiscontinuity type for identifying key discontinuities
+    typedef BlockDiscontinuity<
+            OffsetT,
+            BLOCK_THREADS>
+        BlockDiscontinuity;
+
+    // Operator for detecting discontinuities in a list of segment identifiers.
+    struct NewSegmentOp
+    {
+        /// Returns true when the two segment identifiers differ (i.e., a segment boundary lies between them)
+        __device__ __forceinline__ bool operator()(const OffsetT& b, const OffsetT& a)
+        {
+            return (a != b);
+        }
+    };
+
+    // Shared memory type for this thread block (tile loading is unioned with the scan + head-flagging storage)
+    struct _TempStorage
+    {
+        union
+        {
+            typename BlockLoad::TempStorage                 load;           // Smem needed for tile loading
+            struct {
+                typename BlockScan::TempStorage             scan;           // Smem needed for reduce-value-by-segment scan
+                typename BlockDiscontinuity::TempStorage    discontinuity;  // Smem needed for head-flagging
+            };
+        };
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage                &temp_storage;          ///< Reference to shared storage
+    WrappedInputIteratorT       d_tuple_partials;       ///< A sequence of partial reduction tuples to scan
+    OutputIteratorT              d_output;               ///< A sequence of segment totals
+    Value                       identity;               ///< Identity value (for zero-length segments)
+    ReduceByKeyOp               scan_op;                ///< Reduce-by-key scan operator
+    RunningPrefixCallbackOp     prefix_op;              ///< Stateful running total for block-wide prefix scan of partial reduction tuples
+
+
+    //---------------------------------------------------------------------
+    // Operations
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__
+    BlockSegReduceRegionByKey(
+        TempStorage             &temp_storage,          ///< Reference to shared storage
+        InputIteratorT          d_tuple_partials,       ///< A sequence of partial reduction tuples to scan
+        OutputIteratorT          d_output,               ///< A sequence of segment totals
+        Value                   identity,               ///< Identity value (for zero-length segments)
+        ReductionOp             reduction_op)           ///< Reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_tuple_partials(d_tuple_partials),
+        d_output(d_output),
+        identity(identity),
+        scan_op(reduction_op),
+        prefix_op(scan_op)
+    {}
+
+
+
+    /**
+     * Processes a reduce-value-by-key input tile, outputting reductions for each segment
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__
+    void ProcessTile(
+        OffsetT block_offset,                   // Offset of this tile within d_tuple_partials
+        OffsetT first_segment_idx,              // Not referenced in this method
+        OffsetT last_segment_idx,               // Key given to out-of-bounds items of a partially-full tile
+        int guarded_items = TILE_ITEMS)         // Number of valid items (only used when !FULL_TILE)
+    {
+        KeyValuePair    partial_reductions[ITEMS_PER_THREAD];
+        OffsetT         segment_ids[ITEMS_PER_THREAD];
+        HeadFlag        head_flags[ITEMS_PER_THREAD];
+
+        // Load a tile of block partials from previous kernel
+        if (FULL_TILE)
+        {
+            // Full tile
+            BlockLoad(temp_storage.load).Load(d_tuple_partials + block_offset, partial_reductions);
+        }
+        else
+        {
+            KeyValuePair oob_default;
+            oob_default.key    = last_segment_idx;       // The last segment ID to be reduced
+            oob_default.value  = identity;
+
+            // Partially-full tile
+            BlockLoad(temp_storage.load).Load(d_tuple_partials + block_offset, partial_reductions, guarded_items, oob_default);
+        }
+
+        // Barrier for shared memory reuse (load storage is unioned with scan/discontinuity storage)
+        __syncthreads();
+
+        // Copy the segment IDs for head-flagging
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            segment_ids[ITEM] = partial_reductions[ITEM].key;
+        }
+
+        // Flag segment heads by looking for discontinuities
+        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
+            head_flags,                         // [out] Head flags
+            segment_ids,                        // Segment ids
+            NewSegmentOp(),                     // Functor for detecting start of new rows
+            prefix_op.running_total.key);       // Last segment ID from previous tile to compare with first segment ID in this tile
+
+        // Reduce-value-by-segment across partial_reductions using exclusive prefix scan
+        KeyValuePair block_aggregate;
+        BlockScan(temp_storage.scan).ExclusiveScan(
+            partial_reductions,                   // Scan input
+            partial_reductions,                   // Scan output
+            scan_op,                        // Scan operator
+            block_aggregate,                // Block-wide total (unused)
+            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
+
+        // Scatter an accumulated reduction if it is the head of a valid segment
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if (head_flags[ITEM])
+            {
+                d_output[partial_reductions[ITEM].key] = partial_reductions[ITEM].value;
+            }
+        }
+    }
+
+
+    /**
+     * Iterate over input tiles belonging to this thread block
+     */
+    __device__ __forceinline__
+    void ProcessRegion(
+        OffsetT block_offset,                   // Offset of the first tuple to consume
+        OffsetT block_end,                      // Offset one past the last tuple to consume
+        OffsetT first_segment_idx,              // Segment index used to seed the running prefix
+        OffsetT last_segment_idx)               // Forwarded to ProcessTile for padding partial tiles
+    {
+        if (threadIdx.x == 0)
+        {
+            // Initialize running prefix to the first segment index paired with identity (thread-0 only)
+            prefix_op.running_total.key    = first_segment_idx;
+            prefix_op.running_total.value  = identity;
+        }
+
+        // Process full tiles
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ProcessTile<true>(block_offset, first_segment_idx, last_segment_idx);
+            __syncthreads();
+
+            block_offset += TILE_ITEMS;
+        }
+
+        // Process final partially-full tile (if present)
+        int guarded_items = block_end - block_offset;
+        if (guarded_items)
+        {
+            ProcessTile<false>(block_offset, first_segment_idx, last_segment_idx, guarded_items);
+        }
+    }
+};
+
+
+
+/******************************************************************************
+ * Kernel entrypoints
+ ******************************************************************************/
+
+/**
+ * Segmented reduce partition kernel entry point: each thread computes one merge-path partition sample.
+ */
+
+template <
+    typename SegmentOffsetIterator,             ///< Random-access input iterator type for reading segment end-offsets
+    typename OffsetT>                           ///< Signed integer type for global offsets
+__global__ void SegReducePartitionKernel(
+    SegmentOffsetIterator       d_segment_end_offsets,  ///< [in] A sequence of \p num_segments segment end-offsets
+    IndexPair<OffsetT>          *d_block_idx,           ///< [out] One merge-path (segment, value) index pair per partition sample
+    int                         num_partition_samples,  ///< [in] Number of entries to write to \p d_block_idx
+    OffsetT                     num_values,             ///< [in] Number of values to reduce
+    OffsetT                     num_segments,           ///< [in] Number of segments being reduced
+    GridEvenShare<OffsetT>      even_share)             ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    // Segment offset type (the element type read through the segment end-offset iterator)
+    typedef typename std::iterator_traits<SegmentOffsetIterator>::value_type SegmentOffset;
+
+    // Counting iterator type (uses the SegmentOffset typedef above; SegmentOffsetT was never declared here)
+    typedef CountingInputIterator<SegmentOffset, OffsetT> CountingIterator;
+
+    // Cache-modified iterator for segment end-offsets
+    CacheModifiedInputIterator<LOAD_LDG, SegmentOffset, OffsetT> d_wrapped_segment_end_offsets(d_segment_end_offsets);
+
+    // Counting iterator for value offsets
+    CountingIterator d_value_offsets(0);
+
+    // Initialize even-share to tell us where to start and stop our tile-processing (one partition per thread)
+    int partition_id = (blockDim.x * blockIdx.x) + threadIdx.x;
+    even_share.Init(partition_id);
+
+    // Search for block starting and ending indices over the whole (segments x values) range
+    IndexPair<OffsetT> start_idx = {0, 0};
+    IndexPair<OffsetT> end_idx   = {num_segments, num_values};
+    IndexPair<OffsetT> block_idx;
+
+    MergePathSearch(
+        even_share.block_offset,            // Next thread diagonal
+        d_wrapped_segment_end_offsets,      // A (segment end-offsets)
+        d_value_offsets,                    // B (value offsets)
+        start_idx,                          // Start indices into A and B
+        end_idx,                            // End indices into A and B
+        block_idx);                         // [out] diagonal intersection indices into A and B
+
+    // Write output (threads beyond the number of samples write nothing)
+    if (partition_id < num_partition_samples)
+    {
+        d_block_idx[partition_id] = block_idx;
+    }
+}
+
+
+/**
+ * Segmented reduce region kernel entry point (multi-block).  When gridDim.x > 1, thread-0 of each block also writes the block's first/last partial tuples to \p d_tuple_partials.
+ */
+template <
+    typename BlockSegReduceRegionPolicy,        ///< Parameterized BlockSegReduceRegionPolicy tuning policy
+    typename SegmentOffsetIterator,             ///< Random-access input iterator type for reading segment end-offsets
+    typename ValueIterator,                     ///< Random-access input iterator type for reading values
+    typename OutputIteratorT,                   ///< Random-access output iterator type for writing segment reductions
+    typename ReductionOp,                       ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename OffsetT,                           ///< Signed integer type for global offsets
+    typename Value>                             ///< Value type (type of \p identity and of the values held in the partial tuples)
+__launch_bounds__ (BlockSegReduceRegionPolicy::BLOCK_THREADS)
+__global__ void SegReduceRegionKernel(
+    SegmentOffsetIterator       d_segment_end_offsets,  ///< [in] A sequence of \p num_segments segment end-offsets
+    ValueIterator               d_values,               ///< [in] A sequence of \p num_values values
+    OutputIteratorT              d_output,               ///< [out] A sequence of \p num_segments segment totals
+    KeyValuePair<OffsetT, Value> *d_tuple_partials,      ///< [out] A sequence of (gridDim.x * 2) partial reduction tuples
+    IndexPair<OffsetT>          *d_block_idx,           ///< Index pairs forwarded unchanged to BlockSegReduceRegion (presumably the per-block ranges produced by the partition kernel -- TODO confirm)
+    OffsetT                     num_values,             ///< [in] Number of values to reduce
+    OffsetT                     num_segments,           ///< [in] Number of segments being reduced
+    Value                       identity,               ///< [in] Identity value (for zero-length segments)
+    ReductionOp                 reduction_op,           ///< [in] Reduction operator
+    GridEvenShare<OffsetT>      even_share)             ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    typedef KeyValuePair<OffsetT, Value> KeyValuePair;
+
+    // Specialize thread block abstraction type for reducing a range of segmented values
+    typedef BlockSegReduceRegion<
+            BlockSegReduceRegionPolicy,
+            SegmentOffsetIterator,
+            ValueIterator,
+            OutputIteratorT,
+            ReductionOp,
+            OffsetT>
+        BlockSegReduceRegion;
+
+    // Shared memory allocation
+    __shared__ typename BlockSegReduceRegion::TempStorage temp_storage;
+
+    // Initialize thread block even-share to tell us where to start and stop our tile-processing
+    even_share.BlockInit();
+
+    // Construct persistent thread block
+    BlockSegReduceRegion thread_block(
+        temp_storage,
+        d_segment_end_offsets,
+        d_values,
+        d_output,
+        d_block_idx,
+        num_values,
+        num_segments,
+        identity,
+        reduction_op);
+
+    // First and last partial reduction tuples within the range (valid in thread-0)
+    KeyValuePair first_tuple, last_tuple;
+
+    // Consume block's region of work
+    thread_block.ProcessRegion(
+        even_share.block_offset,
+        even_share.block_end,
+        first_tuple,
+        last_tuple);
+
+    if (threadIdx.x == 0)                       // Only thread-0 publishes the tuples
+    {
+        if (gridDim.x > 1)                      // Partials are only published for multi-block grids
+        {
+            // Special case where the first segment written and the carry-out are for the same segment:
+            // reset the first tuple to the identity so that only the last tuple carries that segment's partial
+            if (first_tuple.key == last_tuple.key)
+            {
+                first_tuple.value = identity;
+            }
+
+            // Write the first and last partial products from this thread block so
+            // that they can be subsequently "fixed up" in the next kernel.
+            d_tuple_partials[blockIdx.x * 2]          = first_tuple;
+            d_tuple_partials[(blockIdx.x * 2) + 1]    = last_tuple;
+        }
+    }
+
+}
+
+
+/**
+ * Segmented reduce-by-key kernel entry point (single-block): processes the partial tuples in [0, num_tuple_partials) for segment IDs [0, num_segments).
+ */
+template <
+    typename    BlockSegReduceRegionByKeyPolicy,        ///< Parameterized BlockSegReduceRegionByKeyPolicy tuning policy
+    typename    InputIteratorT,                         ///< Random-access iterator referencing key-value input tuples
+    typename    OutputIteratorT,                        ///< Random-access iterator referencing segment output totals
+    typename    ReductionOp,                            ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename    OffsetT,                                ///< Signed integer type for global offsets
+    typename    Value>                                  ///< Value type (type of \p identity)
+__launch_bounds__ (BlockSegReduceRegionByKeyPolicy::BLOCK_THREADS, 1)
+__global__ void SegReduceRegionByKeyKernel(
+    InputIteratorT          d_tuple_partials,           ///< [in] A sequence of partial reduction tuples
+    OutputIteratorT          d_output,                   ///< [out] A sequence of \p num_segments segment totals
+    OffsetT                 num_segments,               ///< [in] Number of segments in the \p d_output sequence
+    int                     num_tuple_partials,         ///< [in] Number of partial reduction tuples being reduced
+    Value                   identity,                   ///< [in] Identity value (for zero-length segments)
+    ReductionOp             reduction_op)               ///< [in] Reduction operator
+{
+    // Specialize thread block abstraction type for reducing a range of values by key
+    typedef BlockSegReduceRegionByKey<
+            BlockSegReduceRegionByKeyPolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            ReductionOp>
+        BlockSegReduceRegionByKey;
+
+    // Shared memory allocation
+    __shared__ typename BlockSegReduceRegionByKey::TempStorage temp_storage;
+
+    // Construct persistent thread block
+    BlockSegReduceRegionByKey thread_block(
+        temp_storage,
+        d_tuple_partials,
+        d_output,
+        identity,
+        reduction_op);
+
+    // Process every tuple in one region spanning all segments
+    thread_block.ProcessRegion(
+        0,                          // Region start
+        num_tuple_partials,         // Region end
+        0,                          // First segment ID
+        num_segments);              // Last segment ID (one-past)
+}
+
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceSegReduce
+ */
+template <
+    typename ValueIterator,                     ///< Random-access input iterator type for reading values
+    typename SegmentOffsetIterator,             ///< Random-access input iterator type for reading segment end-offsets
+    typename OutputIteratorT,                   ///< Random-access output iterator type for writing segment reductions
+    typename ReductionOp,                       ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+    typename OffsetT>                           ///< Signed integer type for global offsets
+struct DeviceSegReduceDispatch
+{
+    // Value type
+    typedef typename std::iterator_traits<ValueIterator>::value_type Value;
+
+    // Reduce-by-key data type tuple (segment-ID, value)
+    typedef KeyValuePair<OffsetT, Value> KeyValuePair;
+
+    // Index pair data type
+    typedef IndexPair<OffsetT>IndexPair;
+
+
+    /******************************************************************************
+     * Tuning policies
+     ******************************************************************************/
+
+    /// SM35 tuning policy (selected when the PTX version is >= 350)
+    struct Policy350
+    {
+        // Policy for the multi-block SegReduceRegionKernel
+        typedef BlockSegReduceRegionPolicy<
+                128,                            ///< Threads per thread block
+                6,                              ///< Items per thread (per tile of input)
+                true,                           ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
+                false,                          ///< Whether or not to cache incoming values in shared memory before reducing each tile
+                LOAD_DEFAULT,                   ///< Cache load modifier for reading segment offsets
+                LOAD_LDG,                       ///< Cache load modifier for reading values
+                BLOCK_REDUCE_RAKING,            ///< The BlockReduce algorithm to use
+                BLOCK_SCAN_WARP_SCANS>          ///< The BlockScan algorithm to use
+            SegReduceRegionPolicy;
+
+        // Policy for the single-block SegReduceRegionByKeyKernel
+        typedef BlockSegReduceRegionByKeyPolicy<
+                256,                            ///< Threads per thread block
+                9,                             ///< Items per thread (per tile of input)
+                BLOCK_LOAD_DIRECT,              ///< The BlockLoad algorithm to use
+                false,                          ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+                LOAD_LDG,                       ///< Cache load modifier for reading input elements
+                BLOCK_SCAN_WARP_SCANS>          ///< The BlockScan algorithm to use
+            SegReduceRegionByKeyPolicy;
+    };
+
+
+    /// SM10 tuning policy (fallback for every PTX version below 350)
+    struct Policy100
+    {
+        // Policy for the multi-block SegReduceRegionKernel
+        typedef BlockSegReduceRegionPolicy<
+                128,                            ///< Threads per thread block
+                3,                              ///< Items per thread (per tile of input)
+                false,                          ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
+                false,                          ///< Whether or not to cache incoming values in shared memory before reducing each tile
+                LOAD_DEFAULT,                   ///< Cache load modifier for reading segment offsets
+                LOAD_DEFAULT,                   ///< Cache load modifier for reading values
+                BLOCK_REDUCE_RAKING,            ///< The BlockReduce algorithm to use
+                BLOCK_SCAN_RAKING>              ///< The BlockScan algorithm to use
+            SegReduceRegionPolicy;
+
+        // Policy for the single-block SegReduceRegionByKeyKernel
+        typedef BlockSegReduceRegionByKeyPolicy<
+                128,                            ///< Threads per thread block
+                3,                              ///< Items per thread (per tile of input)
+                BLOCK_LOAD_WARP_TRANSPOSE,      ///< The BlockLoad algorithm to use
+                false,                          ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
+                LOAD_DEFAULT,                   ///< Cache load modifier for reading input elements
+                BLOCK_SCAN_WARP_SCANS>          ///< The BlockScan algorithm to use
+            SegReduceRegionByKeyPolicy;
+    };
+
+
+    /******************************************************************************
+     * Tuning policies of current PTX compiler pass (Policy350 for CUB_PTX_ARCH >= 350, otherwise Policy100)
+     ******************************************************************************/
+
+#if (CUB_PTX_ARCH >= 350)
+    typedef Policy350 PtxPolicy;
+/* Disabled: Policy300, Policy200 and Policy130 are not defined in this struct
+#elif (CUB_PTX_ARCH >= 300)
+    typedef Policy300 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 200)
+    typedef Policy200 PtxPolicy;
+
+#elif (CUB_PTX_ARCH >= 130)
+    typedef Policy130 PtxPolicy;
+*/
+#else
+    typedef Policy100 PtxPolicy;
+
+#endif
+
+    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
+    struct PtxSegReduceRegionPolicy           : PtxPolicy::SegReduceRegionPolicy {};
+    struct PtxSegReduceRegionByKeyPolicy      : PtxPolicy::SegReduceRegionByKeyPolicy {};
+
+
+    /******************************************************************************
+     * Utilities
+     ******************************************************************************/
+
+    /**
+     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use (device pass: the current PTX policy; host pass: the policy matching \p ptx_version)
+     */
+    template <
+        typename SegReduceKernelConfig,
+        typename SegReduceByKeyKernelConfig>
+    __host__ __device__ __forceinline__
+    static void InitConfigs(
+        int                         ptx_version,                        ///< [in] PTX version of the target device (only consulted on the host pass)
+        SegReduceKernelConfig       &seg_reduce_region_config,          ///< [out] Receives the SegReduceRegionPolicy constants
+        SegReduceByKeyKernelConfig  &seg_reduce_region_by_key_config)   ///< [out] Receives the SegReduceRegionByKeyPolicy constants
+    {
+    #if (CUB_PTX_ARCH > 0)
+
+        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+        // (the 'template' disambiguator is required because the config types are template parameters, i.e. dependent)
+        seg_reduce_region_config.template          Init<PtxSegReduceRegionPolicy>();
+        seg_reduce_region_by_key_config.template   Init<PtxSegReduceRegionByKeyPolicy>();
+    #else
+
+        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+        if (ptx_version >= 350)
+        {
+            seg_reduce_region_config.template          Init<typename Policy350::SegReduceRegionPolicy>();
+            seg_reduce_region_by_key_config.template   Init<typename Policy350::SegReduceRegionByKeyPolicy>();
+        }
+/*
+        else if (ptx_version >= 300)
+        {
+            seg_reduce_region_config.template          Init<typename Policy300::SegReduceRegionPolicy>();
+            seg_reduce_region_by_key_config.template   Init<typename Policy300::SegReduceRegionByKeyPolicy>();
+        }
+        else if (ptx_version >= 200)
+        {
+            seg_reduce_region_config.template          Init<typename Policy200::SegReduceRegionPolicy>();
+            seg_reduce_region_by_key_config.template   Init<typename Policy200::SegReduceRegionByKeyPolicy>();
+        }
+        else if (ptx_version >= 130)
+        {
+            seg_reduce_region_config.template          Init<typename Policy130::SegReduceRegionPolicy>();
+            seg_reduce_region_by_key_config.template   Init<typename Policy130::SegReduceRegionByKeyPolicy>();
+        }
+*/
+        else
+        {
+            seg_reduce_region_config.template          Init<typename Policy100::SegReduceRegionPolicy>();
+            seg_reduce_region_by_key_config.template   Init<typename Policy100::SegReduceRegionByKeyPolicy>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * SegReduceRegionKernel kernel dispatch configuration: a runtime record of the compile-time constants of a SegReduceRegionPolicy
+     */
+    struct SegReduceKernelConfig
+    {
+        int                     block_threads;
+        int                     items_per_thread;
+        bool                    use_smem_segment_cache;
+        bool                    use_smem_value_cache;
+        CacheLoadModifier       load_modifier_segments;
+        CacheLoadModifier       load_modifier_values;
+        BlockReduceAlgorithm    reduce_algorithm;
+        BlockScanAlgorithm      scan_algorithm;
+
+        template <typename SegReduceRegionPolicy>
+        __host__ __device__ __forceinline__
+        void Init()                             // Copies each tuning constant of SegReduceRegionPolicy into the matching field
+        {
+            block_threads               = SegReduceRegionPolicy::BLOCK_THREADS;
+            items_per_thread            = SegReduceRegionPolicy::ITEMS_PER_THREAD;
+            use_smem_segment_cache      = SegReduceRegionPolicy::USE_SMEM_SEGMENT_CACHE;
+            use_smem_value_cache        = SegReduceRegionPolicy::USE_SMEM_VALUE_CACHE;
+            load_modifier_segments      = SegReduceRegionPolicy::LOAD_MODIFIER_SEGMENTS;
+            load_modifier_values        = SegReduceRegionPolicy::LOAD_MODIFIER_VALUES;
+            reduce_algorithm            = SegReduceRegionPolicy::REDUCE_ALGORITHM;
+            scan_algorithm              = SegReduceRegionPolicy::SCAN_ALGORITHM;
+        }
+    };
+
+    /**
+     * SegReduceRegionByKeyKernel kernel dispatch configuration: a runtime record of the compile-time constants of a SegReduceRegionByKeyPolicy
+     */
+    struct SegReduceByKeyKernelConfig
+    {
+        int                     block_threads;
+        int                     items_per_thread;
+        BlockLoadAlgorithm      load_algorithm;
+        bool                    load_warp_time_slicing;
+        CacheLoadModifier       load_modifier;
+        BlockScanAlgorithm      scan_algorithm;
+
+        template <typename SegReduceRegionByKeyPolicy>
+        __host__ __device__ __forceinline__
+        void Init()                             // Copies each tuning constant of SegReduceRegionByKeyPolicy into the matching field
+        {
+            block_threads               = SegReduceRegionByKeyPolicy::BLOCK_THREADS;
+            items_per_thread            = SegReduceRegionByKeyPolicy::ITEMS_PER_THREAD;
+            load_algorithm              = SegReduceRegionByKeyPolicy::LOAD_ALGORITHM;
+            load_warp_time_slicing      = SegReduceRegionByKeyPolicy::LOAD_WARP_TIME_SLICING;
+            load_modifier               = SegReduceRegionByKeyPolicy::LOAD_MODIFIER;
+            scan_algorithm              = SegReduceRegionByKeyPolicy::SCAN_ALGORITHM;
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide segmented reduction: sizes/aliases the temporary storage, then launches the partition kernel (multi-block grids only) and the region kernel.
+     */
+    template <
+        typename                        SegReducePartitionKernelPtr,            ///< Function type of cub::SegReducePartitionKernel
+        typename                        SegReduceRegionKernelPtr,               ///< Function type of cub::SegReduceRegionKernel
+        typename                        SegReduceRegionByKeyKernelPtr>          ///< Function type of cub::SegReduceRegionByKeyKernel
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                          &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
+        ValueIterator                   d_values,                               ///< [in] A sequence of \p num_values data to reduce
+        SegmentOffsetIterator           d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
+        OutputIteratorT                  d_output,                               ///< [out] A sequence of \p num_segments segment totals
+        OffsetT                         num_values,                             ///< [in] Total number of values to reduce
+        OffsetT                         num_segments,                           ///< [in] Number of segments being reduced
+        Value                           identity,                               ///< [in] Identity value (for zero-length segments)
+        ReductionOp                     reduction_op,                           ///< [in] Reduction operator
+        cudaStream_t                    stream,                                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                            debug_synchronous,                      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        int                             sm_version,                             ///< [in] SM version of target device to use when computing SM occupancy
+        SegReducePartitionKernelPtr     seg_reduce_partition_kernel,            ///< [in] Kernel function pointer to parameterization of cub::SegReducePartitionKernel
+        SegReduceRegionKernelPtr        seg_reduce_region_kernel,               ///< [in] Kernel function pointer to parameterization of cub::SegReduceRegionKernel
+        SegReduceRegionByKeyKernelPtr   seg_reduce_region_by_key_kernel,        ///< [in] Kernel function pointer to parameterization of cub::SegReduceRegionByKeyKernel (currently unused: the fix-up launch is commented out)
+        SegReduceKernelConfig           &seg_reduce_region_config,              ///< [in] Dispatch parameters that match the policy that \p seg_reduce_region_kernel was compiled for
+        SegReduceByKeyKernelConfig      &seg_reduce_region_by_key_config)       ///< [in] Dispatch parameters that match the policy that \p seg_reduce_region_by_key_kernel was compiled for (currently unused: the fix-up launch is commented out)
+    {
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Dispatch up to two kernels: (1) a partition kernel (multi-block grids only) that
+            // fills d_block_idx, and (2) a multi-block segmented reduction of regions by block.
+            // The single-block reduce-by-key "fix-up" launch is currently commented out below.
+
+            // Tile size of seg_reduce_region_kernel
+            int tile_size = seg_reduce_region_config.block_threads * seg_reduce_region_config.items_per_thread;
+
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get SM occupancy for seg_reduce_region_kernel
+            int seg_reduce_region_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                seg_reduce_region_sm_occupancy,
+                sm_version,
+                seg_reduce_region_kernel,
+                seg_reduce_region_config.block_threads))) break;
+
+            // Get device occupancy for seg_reduce_region_kernel
+            int seg_reduce_region_occupancy = seg_reduce_region_sm_occupancy * sm_count;
+
+            // Even-share work distribution
+            int num_diagonals = num_values + num_segments;                  // Total number of work items
+            int subscription_factor = seg_reduce_region_sm_occupancy;       // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
+            int max_grid_size = seg_reduce_region_occupancy * subscription_factor;
+            GridEvenShare<OffsetT>even_share(
+                num_diagonals,
+                max_grid_size,
+                tile_size);
+
+            // Get grid size for seg_reduce_region_kernel
+            int seg_reduce_region_grid_size = even_share.grid_size;
+
+            // Number of "fix-up" reduce-by-key tuples (2 per thread block)
+            int num_tuple_partials = seg_reduce_region_grid_size * 2;
+            int num_partition_samples = seg_reduce_region_grid_size + 1;
+
+            // Temporary storage allocation requirements
+            void* allocations[2];
+            size_t allocation_sizes[2] =
+            {
+                num_tuple_partials * sizeof(KeyValuePair),  // bytes needed for "fix-up" reduce-by-key tuples
+                num_partition_samples * sizeof(IndexPair),  // bytes needed block indices
+            };
+
+            // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Alias the allocations
+            KeyValuePair    *d_tuple_partials   = (KeyValuePair*) allocations[0];           // "fix-up" tuples
+            IndexPair       *d_block_idx        = (IndexPair *) allocations[1];             // block starting/ending indices
+
+            // Array of segment end-offsets
+            SegmentOffsetIterator d_segment_end_offsets = d_segment_offsets + 1;
+
+            // Grid launch params for seg_reduce_partition_kernel
+            int partition_block_size = 32;
+            int partition_grid_size = (num_partition_samples + partition_block_size - 1) / partition_block_size;
+
+            // Partition work among multiple thread blocks if necessary
+            if (seg_reduce_region_grid_size > 1)
+            {
+                // Log seg_reduce_partition_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking seg_reduce_partition_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    partition_grid_size, partition_block_size, (long long) stream);
+
+                // Invoke seg_reduce_partition_kernel
+                seg_reduce_partition_kernel<<<partition_grid_size, partition_block_size, 0, stream>>>(
+                    d_segment_end_offsets,  ///< [in] A sequence of \p num_segments segment end-offsets
+                    d_block_idx,
+                    num_partition_samples,
+                    num_values,             ///< [in] Number of values to reduce
+                    num_segments,           ///< [in] Number of segments being reduced
+                    even_share);            ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+                // Check for failure to launch, then sync the stream if specified
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+
+            // Log seg_reduce_region_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking seg_reduce_region_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                seg_reduce_region_grid_size, seg_reduce_region_config.block_threads, (long long) stream, seg_reduce_region_config.items_per_thread, seg_reduce_region_sm_occupancy);
+
+            // NOTE(review): switches the current device to 8-byte shared memory banks; the previous setting is not restored by this routine
+            if (CubDebug(error = cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte))) break;
+
+            // Invoke seg_reduce_region_kernel
+            seg_reduce_region_kernel<<<seg_reduce_region_grid_size, seg_reduce_region_config.block_threads, 0, stream>>>(
+                d_segment_end_offsets,
+                d_values,
+                d_output,
+                d_tuple_partials,
+                d_block_idx,
+                num_values,
+                num_segments,
+                identity,
+                reduction_op,
+                even_share);
+            // Check for failure to launch, then sync the stream if specified
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+/*
+            // Perform "fix-up" of region partial reductions if grid size is greater than one thread block
+            if (seg_reduce_region_grid_size > 1)
+            {
+                // Log seg_reduce_region_by_key_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking seg_reduce_region_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+                    1, seg_reduce_region_by_key_config.block_threads, (long long) stream, seg_reduce_region_by_key_config.items_per_thread);
+
+                // Invoke seg_reduce_region_by_key_kernel
+                seg_reduce_region_by_key_kernel<<<1, seg_reduce_region_by_key_config.block_threads, 0, stream>>>(
+                    d_tuple_partials,
+                    d_output,
+                    num_segments,
+                    num_tuple_partials,
+                    identity,
+                    reduction_op);
+
+                // Sync the stream if specified
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+*/
+        }
+
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine for computing a device-wide segmented reduction: picks the tuning configurations for the PTX version and forwards to the kernel-pointer overload.
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t Dispatch(
+        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                          &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
+        ValueIterator                   d_values,                               ///< [in] A sequence of \p num_values data to reduce
+        SegmentOffsetIterator           d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
+        OutputIteratorT                  d_output,                               ///< [out] A sequence of \p num_segments segment totals
+        OffsetT                         num_values,                             ///< [in] Total number of values to reduce
+        OffsetT                         num_segments,                           ///< [in] Number of segments being reduced
+        Value                           identity,                               ///< [in] Identity value (for zero-length segments)
+        ReductionOp                     reduction_op,                           ///< [in] Reduction operator
+        cudaStream_t                    stream,                                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                            debug_synchronous)                      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version (queried at runtime on the host pass, statically known on the device pass)
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            SegReduceKernelConfig seg_reduce_region_config;
+            SegReduceByKeyKernelConfig seg_reduce_region_by_key_config;
+
+            InitConfigs(ptx_version, seg_reduce_region_config, seg_reduce_region_by_key_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_values,
+                d_segment_offsets,
+                d_output,
+                num_values,
+                num_segments,
+                identity,
+                reduction_op,
+                stream,
+                debug_synchronous,
+                ptx_version,            // Use PTX version instead of SM version because, as a statically known quantity, this improves device-side launch dramatically but at the risk of imprecise occupancy calculation for mismatches
+                SegReducePartitionKernel<SegmentOffsetIterator, OffsetT>,
+                SegReduceRegionKernel<PtxSegReduceRegionPolicy, SegmentOffsetIterator, ValueIterator, OutputIteratorT, ReductionOp, OffsetT, Value>,
+                SegReduceRegionByKeyKernel<PtxSegReduceRegionByKeyPolicy, KeyValuePair*, OutputIteratorT, ReductionOp, OffsetT, Value>,
+                seg_reduce_region_config,
+                seg_reduce_region_by_key_config))) break;
+        }
+        while (0);
+
+        return error;
+
+    }
+};
+
+
+
+
+/******************************************************************************
+ * DeviceSegReduce
+ *****************************************************************************/
+
+/**
+ * \brief DeviceSegReduce provides operations for computing a device-wide, parallel segmented reduction across a sequence of data items residing within global memory.
+ * \ingroup DeviceModule
+ *
+ * \par Overview
+ * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
+ * uses a binary combining operator to compute a single aggregate from a list of input elements.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSegReduce}
+ *
+ */
+struct DeviceSegReduce
+{
+    /**
+     * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor.
+     *
+     * \par
+     * Does not support non-commutative reduction operators.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \iterator
+     *
+     * \tparam ValueIterator            <b>[inferred]</b> Random-access input iterator type for reading values
+     * \tparam SegmentOffsetIterator    <b>[inferred]</b> Random-access input iterator type for reading the (\p num_segments + 1) segment offsets
+     * \tparam OutputIteratorT           <b>[inferred]</b> Random-access output iterator type for writing segment reductions
+     * \tparam Value                    <b>[inferred]</b> Type of the \p identity argument (passed on to the dispatch layer, which takes it as the value type of \p ValueIterator)
+     * \tparam ReductionOp              <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+     */
+    template <
+        typename                ValueIterator,
+        typename                SegmentOffsetIterator,
+        typename                OutputIteratorT,
+        typename                Value,
+        typename                ReductionOp>
+    __host__ __device__ __forceinline__
+    static cudaError_t Reduce(
+        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
+        ValueIterator           d_values,                               ///< [in] A sequence of \p num_values data to reduce
+        SegmentOffsetIterator   d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
+        OutputIteratorT          d_output,                               ///< [out] A sequence of \p num_segments segment totals
+        int                     num_values,                             ///< [in] Total number of values to reduce
+        int                     num_segments,                           ///< [in] Number of segments being reduced
+        Value                   identity,                               ///< [in] Identity value (for zero-length segments)
+        ReductionOp             reduction_op,                           ///< [in] Reduction operator
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets (fixed to int, matching the \p num_values and \p num_segments parameters)
+        typedef int OffsetT;
+
+        typedef DeviceSegReduceDispatch<
+                ValueIterator,
+                SegmentOffsetIterator,
+                OutputIteratorT,
+                ReductionOp,
+                OffsetT>
+            DeviceSegReduceDispatch;
+
+        return DeviceSegReduceDispatch::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_values,
+            d_segment_offsets,
+            d_output,
+            num_values,
+            num_segments,
+            identity,
+            reduction_op,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
+     *
+     * \par
+     * Does not support non-commutative summation.
+     *
+     * \devicestorage
+     *
+     * \cdp
+     *
+     * \iterator
+     *
+     * \tparam ValueIterator            <b>[inferred]</b> Random-access input iterator type for reading values
+     * \tparam SegmentOffsetIterator    <b>[inferred]</b> Random-access input iterator type for reading segment end-offsets
+     * \tparam OutputIteratorT           <b>[inferred]</b> Random-access output iterator type for writing segment reductions
+     */
+    template <
+        typename                ValueIterator,
+        typename                SegmentOffsetIterator,
+        typename                OutputIteratorT>
+    __host__ __device__ __forceinline__
+    static cudaError_t Sum(
+        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
+        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
+        ValueIterator           d_values,                               ///< [in] A sequence of \p num_values data to reduce
+        SegmentOffsetIterator   d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
+        OutputIteratorT          d_output,                               ///< [out] A sequence of \p num_segments segment totals
+        int                     num_values,                             ///< [in] Total number of values to reduce
+        int                     num_segments,                           ///< [in] Number of segments being reduced
+        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // Value type
+        typedef typename std::iterator_traits<ValueIterator>::value_type Value;
+
+        Value identity = Value();
+        cub::Sum reduction_op;
+
+        typedef DeviceSegReduceDispatch<
+                ValueIterator,
+                SegmentOffsetIterator,
+                OutputIteratorT,
+                cub::Sum,
+                OffsetT>
+            DeviceSegReduceDispatch;
+
+        return DeviceSegReduceDispatch::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_values,
+            d_segment_offsets,
+            d_output,
+            num_values,
+            num_segments,
+            identity,
+            reduction_op,
+            stream,
+            debug_synchronous);
+    }
+};
+
+
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Initialize problem: fill h_values[0..num_values) and append random segment offsets (starting at 0, ending at num_values) to segment_offsets
+ */
+template <typename OffsetT, typename Value>
+void Initialize(
+    GenMode         gen_mode,               // value-generation mode forwarded to InitValue
+    Value           *h_values,              // [out] host array receiving num_values generated values
+    vector<OffsetT> &segment_offsets,       // [out] offsets are appended; one more entry than segments generated
+    int             num_values,             // number of values to generate and to cover with segments
+    int             avg_segment_size)       // average segment length; values below 1 are treated as 1
+{
+    // Initialize values
+//    if (g_verbose) printf("Values: ");
+    for (int i = 0; i < num_values; ++i)
+    {
+        InitValue(gen_mode, h_values[i], i);
+//        if (g_verbose) std::cout << h_values[i] << ", ";
+    }
+//    if (g_verbose) printf("\n\n");
+
+    // Initialize segment lengths (lengths are drawn from [0, MAX_SEGMENT_LENGTH], so zero-length segments do occur)
+    const unsigned int  MAX_INTEGER         = -1u;
+    const unsigned int  MAX_SEGMENT_LENGTH  = ((avg_segment_size > 0) ? avg_segment_size : 1) * 2u;    // clamp: a zero maximum would yield only empty segments and the loop below would never finish
+    const double        SCALE_FACTOR        = double(MAX_SEGMENT_LENGTH) / double(MAX_INTEGER);
+
+    segment_offsets.push_back(0);
+
+    OffsetT consumed = 0;
+    OffsetT remaining = num_values;
+    while (remaining > 0)
+    {
+        // Randomly sample a 32-bit unsigned int
+        unsigned int segment_length;
+        RandomBits(segment_length);
+
+        // Scale to maximum segment length, then clip to the values still unassigned
+        segment_length = (unsigned int) (double(segment_length) * SCALE_FACTOR);
+        segment_length = CUB_MIN(segment_length, remaining);
+
+        consumed += segment_length;
+        remaining -= segment_length;
+
+        segment_offsets.push_back(consumed);
+    }
+}
+
+
+/**
+ * Compute the host reference: for every segment, fold its values onto the identity with '+'
+ */
+template <typename OffsetT, typename Value>
+void ComputeReference(
+    Value       *h_values,
+    OffsetT     *h_segment_offsets,
+    Value       *h_reference,
+    int         num_segments,
+    Value       identity)
+{
+    if (g_verbose) printf("%d segment reductions: ", num_segments);
+    for (int seg = 0; seg < num_segments; ++seg)
+    {
+        // Segment seg spans [h_segment_offsets[seg], h_segment_offsets[seg + 1])
+        Value total = identity;
+        for (int idx = h_segment_offsets[seg]; idx < h_segment_offsets[seg + 1]; ++idx)
+            total += h_values[idx];
+
+        h_reference[seg] = total;
+        if (g_verbose) std::cout << total << ", ";
+    }
+    if (g_verbose) printf("\n\n");
+}
+
+
+/**
+ * Simple test of device: builds a random segmented problem, checks DeviceSegReduce::Sum against the host reference, then times it
+ */
+template <
+    bool            CDP,
+    typename        OffsetT,
+    typename        Value,
+    typename        ReductionOp>
+void Test(
+    OffsetT         num_values,             // number of input values
+    int             avg_segment_size,       // average segment length handed to Initialize
+    ReductionOp     reduction_op,           // not used in the body: DeviceSegReduce::Sum is always invoked; only its type picks the printed label
+    Value           identity,               // identity used for the host reference
+    const char*     type_string)            // printable name of Value (const so string literals can be passed)
+{
+    Value   *h_values = NULL;
+    Value   *h_reference = NULL;
+    OffsetT *h_segment_offsets = NULL;
+
+    printf("%d\n", num_values);
+
+    // Initialize problem on host
+    h_values = new Value[num_values];
+    vector<OffsetT> segment_offsets;
+    Initialize(UNIFORM, h_values, segment_offsets, num_values, avg_segment_size);
+
+    // Allocate simple offsets array and copy STL vector into it
+    h_segment_offsets = new OffsetT[segment_offsets.size()];
+    for (size_t i = 0; i < segment_offsets.size(); ++i)
+        h_segment_offsets[i] = segment_offsets[i];
+
+    OffsetT num_segments = segment_offsets.size() - 1;
+    if (g_verbose)
+    {
+        printf("%d segment offsets: ", num_segments);
+        for (int i = 0; i < num_segments; ++i)
+            std::cout << h_segment_offsets[i] << "(" << h_segment_offsets[i + 1] - h_segment_offsets[i] << "), ";
+        std::cout << std::endl << std::endl;
+    }
+
+    // Solve problem on host
+    h_reference = new Value[num_segments];
+    ComputeReference(h_values, h_segment_offsets, h_reference, num_segments, identity);
+
+    printf("\n\n%s cub::DeviceSegReduce::%s %d items (%d-byte %s), %d segments (%d-byte offset indices)\n",
+        (CDP) ? "CDP device invoked" : "Host-invoked",
+        (Equals<ReductionOp, Sum>::VALUE) ? "Sum" : "Reduce",
+        num_values, (int) sizeof(Value), type_string,
+        num_segments, (int) sizeof(OffsetT));
+    fflush(stdout);
+
+    // Allocate and initialize problem on device
+    Value   *d_values = NULL;
+    OffsetT *d_segment_offsets = NULL;
+    Value   *d_output = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * num_values));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_output, sizeof(Value) * num_segments));
+    CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_values, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
+
+    // Request and allocate temporary storage (first call with NULL storage only reports the size)
+    void    *d_temp_storage = NULL;
+    size_t  temp_storage_bytes = 0;
+    CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output
+    CubDebugExit(cudaMemset(d_output, 0, sizeof(Value) * num_segments));
+
+    // Run warmup/correctness iteration (debug_synchronous = true)
+    CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, true));
+
+    // Check for correctness (and display results, if specified); non-zero means mismatch
+    int compare = CompareDeviceResults(h_reference, d_output, num_segments, true, g_verbose);
+    printf("\t%s", compare ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    for (int i = 0; i < g_timing_iterations; ++i)
+    {
+        CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
+    }
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_values) / avg_millis / 1000.0 / 1000.0;
+        float giga_bandwidth = giga_rate * sizeof(Value);      // logical traffic: one Value read per input item
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
+    }
+
+    // Device cleanup
+    if (d_values) CubDebugExit(g_allocator.DeviceFree(d_values));
+    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+    if (d_output) CubDebugExit(g_allocator.DeviceFree(d_output));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Host cleanup
+    if (h_values)           delete[] h_values;
+    if (h_segment_offsets)  delete[] h_segment_offsets;
+    if (h_reference)        delete[] h_reference;
+}
+
+
+/**
+ * Main: parse command-line options, initialize the device, and run one host-invoked segmented-sum test
+ */
+int main(int argc, char** argv)
+{
+    int num_values          = 32 * 1024 * 1024;     // default problem size, overridden by --n
+    int avg_segment_size    = 500;                  // default average segment length, overridden by --ss
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_values);
+    args.GetCmdLineArgument("ss", avg_segment_size);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+
+    // Print usage and exit when --help is given
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--v] "
+            "[--i=<timing iterations>] "
+            "[--n=<input samples>]\n"
+            "[--ss=<average segment size>]\n"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    Test<false>((int) num_values, avg_segment_size, Sum(), (long long) 0, CUB_TYPE_STRING(long long));    // CDP = false, int offsets, long long values
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/experimental/histogram/histogram_cub.h b/external/cub/experimental/histogram/histogram_cub.h
new file mode 100644
index 00000000000..f33184a58b9
--- /dev/null
+++ b/external/cub/experimental/histogram/histogram_cub.h
@@ -0,0 +1,109 @@
+/******************************************************************************
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <cub/device/device_histogram.cuh>
+
+using namespace cub;
+
+template <
+    int         NUM_CHANNELS,
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+double run_cub_histogram(       // returns the elapsed milliseconds of the timed MultiHistogramEven call
+    PixelType *d_image,         // device image, reinterpreted below as a flat array of samples
+    int width,
+    int height,
+    unsigned int *d_hist,       // device output: NUM_BINS counters per active channel, channel after channel
+    bool is_warmup)             // forwarded as the debug_synchronous argument of MultiHistogramEven
+{
+    enum {
+        is_float = Equals<PixelType, float4>::VALUE,
+    };
+
+    typedef typename If<is_float, float, unsigned char>::Type    SampleT;    // Sample type
+    typedef typename If<is_float, float, unsigned int>::Type     LevelT;     // Level type (uint32 for uchar)
+
+    // Setup data structures
+    unsigned int*       d_histogram[ACTIVE_CHANNELS];
+    int                 num_levels[ACTIVE_CHANNELS];            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT              lower_level[ACTIVE_CHANNELS];           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT              upper_level[ACTIVE_CHANNELS];           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+
+    for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+    {
+        d_histogram[CHANNEL] = d_hist + (CHANNEL * NUM_BINS);
+        num_levels[CHANNEL] = NUM_BINS + 1;
+        lower_level[CHANNEL] = 0;
+        upper_level[CHANNEL] = (is_float) ? 1 : 256;
+    }
+
+    // Allocate temporary storage
+    size_t temp_storage_bytes = 0;
+    void *d_temp_storage = NULL;
+
+    SampleT* d_image_samples = (SampleT*) d_image;
+
+    // Get amount of temporary storage needed (inner parentheses keep the template-argument comma from splitting the macro argument)
+    CubDebugExit((DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, ACTIVE_CHANNELS>(
+        d_temp_storage,
+        temp_storage_bytes,
+        d_image_samples,
+        d_histogram,
+        num_levels,
+        lower_level,
+        upper_level,
+        width * height,
+        (cudaStream_t) 0,
+        is_warmup)));
+
+    CubDebugExit(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    // Compute histogram; a failure aborts instead of producing a meaningless timing
+    CubDebugExit((DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, ACTIVE_CHANNELS>(
+        d_temp_storage,
+        temp_storage_bytes,
+        d_image_samples,
+        d_histogram,
+        num_levels,
+        lower_level,
+        upper_level,
+        width * height,
+        (cudaStream_t) 0,
+        is_warmup)));
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    CubDebugExit(cudaFree(d_temp_storage));
+
+    return elapsed_millis;
+}
+
diff --git a/external/cub/experimental/histogram/histogram_gmem_atomics.h b/external/cub/experimental/histogram/histogram_gmem_atomics.h
new file mode 100644
index 00000000000..c3c9630d2e4
--- /dev/null
+++ b/external/cub/experimental/histogram/histogram_gmem_atomics.h
@@ -0,0 +1,185 @@
+/******************************************************************************
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <test/test_util.h>
+
+namespace histogram_gmem_atomics
+{
+    // Decode float4 pixel into bins (bin = sample * NUM_BINS, truncated; NOTE(review): presumably samples lie in [0, 1) - confirm)
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        float* samples = reinterpret_cast<float*>(&pixel);
+
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+            bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
+    }
+
+    // Decode uchar4 pixel into bins (the byte value is used directly as the bin index)
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);
+
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+            bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
+    }
+
+    // Decode uchar1 pixel into bins (only bins[0] is written)
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        bins[0] = (unsigned int) pixel.x;
+    }
+
+    // First-pass histogram kernel: each block bins into its own NUM_PARTS-wide slice of 'out' in global memory
+    template <
+        int         NUM_PARTS,
+        int         ACTIVE_CHANNELS,
+        int         NUM_BINS,
+        typename    PixelType>
+    __global__ void histogram_gmem_atomics(
+        const PixelType *in,
+        int width,
+        int height,
+        unsigned int *out)
+    {
+        // global position and size
+        int x = blockIdx.x * blockDim.x + threadIdx.x;
+        int y = blockIdx.y * blockDim.y + threadIdx.y;
+        int nx = blockDim.x * gridDim.x;
+        int ny = blockDim.y * gridDim.y;
+
+        // threads in workgroup
+        int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1
+        int nt = blockDim.x * blockDim.y; // total threads in workgroup
+
+        // group index in 0..ngroups-1
+        int g = blockIdx.x + blockIdx.y * gridDim.x;
+
+        // zero this block's partial histogram in gmem before any thread of the block accumulates into it
+        unsigned int *gmem = out + g * NUM_PARTS;
+        for (int i = t; i < ACTIVE_CHANNELS * NUM_BINS; i += nt)
+            gmem[i] = 0;
+        __syncthreads();
+
+        // process pixels (updates our group's partial histogram in gmem)
+        for (int col = x; col < width; col += nx)
+        {
+            for (int row = y; row < height; row += ny)
+            {
+                PixelType pixel = in[row * width + col];
+
+                unsigned int bins[ACTIVE_CHANNELS];
+                DecodePixel<NUM_BINS>(pixel, bins);
+
+                #pragma unroll
+                for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                    atomicAdd(&gmem[(NUM_BINS * CHANNEL) + bins[CHANNEL]], 1);
+            }
+        }
+    }
+
+    // Second pass histogram kernel (accumulation): thread i sums counter i over the n partial histograms
+    template <
+        int         NUM_PARTS,
+        int         ACTIVE_CHANNELS,
+        int         NUM_BINS>
+    __global__ void histogram_gmem_accum(
+        const unsigned int *in,
+        int n,
+        unsigned int *out)
+    {
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        if (i >= ACTIVE_CHANNELS * NUM_BINS)
+            return; // out of range: valid counters are 0 .. ACTIVE_CHANNELS * NUM_BINS - 1
+
+        unsigned int total = 0;
+        for (int j = 0; j < n; j++)
+            total += in[i + NUM_PARTS * j];
+
+        out[i] = total;
+    }
+
+
+}   // namespace histogram_gmem_atomics
+
+
+template <
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+double run_gmem_atomics(        // returns the elapsed milliseconds of the two timed kernel launches
+    PixelType *d_image,         // device image, passed to the binning kernel
+    int width,
+    int height,
+    unsigned int *d_hist,       // device output written by the accumulation kernel
+    bool warmup)                // not used in the body
+{
+    enum
+    {
+        NUM_PARTS = 1024        // counters reserved per block; NOTE(review): must be >= ACTIVE_CHANNELS * NUM_BINS - confirm at call sites
+    };
+
+    cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, 0);
+
+    dim3 block(32, 4);
+    dim3 grid(16, 16);
+    int total_blocks = grid.x * grid.y;
+
+    // allocate partial histogram
+    unsigned int *d_part_hist;
+    cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));
+
+    dim3 block2(128);
+    dim3 grid2((ACTIVE_CHANNELS * NUM_BINS + block2.x - 1) / block2.x);    // ceil-div: one thread per output counter, sized by the accumulation block
+
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    histogram_gmem_atomics::histogram_gmem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
+        d_image,
+        width,
+        height,
+        d_part_hist);
+
+    histogram_gmem_atomics::histogram_gmem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
+        d_part_hist,
+        total_blocks,
+        d_hist);
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    cudaFree(d_part_hist);
+
+    return elapsed_millis;
+}
+
diff --git a/external/cub/experimental/histogram/histogram_smem_atomics.h b/external/cub/experimental/histogram/histogram_smem_atomics.h
new file mode 100644
index 00000000000..5703d81133f
--- /dev/null
+++ b/external/cub/experimental/histogram/histogram_smem_atomics.h
@@ -0,0 +1,195 @@
+/******************************************************************************
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#include <test/test_util.h>
+
+namespace histogram_smem_atomics
+{
+    // Decode float4 pixel into bins (bin = sample * NUM_BINS, truncated; NOTE(review): presumably samples lie in [0, 1) - confirm)
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        float* samples = reinterpret_cast<float*>(&pixel);
+
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+            bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
+    }
+
+    // Decode uchar4 pixel into bins (the byte value is used directly as the bin index)
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);
+
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+            bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
+    }
+
+    // Decode uchar1 pixel into bins (only bins[0] is written)
+    template <int NUM_BINS, int ACTIVE_CHANNELS>
+    __device__ __forceinline__ void DecodePixel(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+    {
+        bins[0] = (unsigned int) pixel.x;
+    }
+
+    // First-pass histogram kernel: each block bins into shared memory, then copies its counters to its NUM_PARTS-wide slice of 'out'
+    template <
+        int         NUM_PARTS,
+        int         ACTIVE_CHANNELS,
+        int         NUM_BINS,
+        typename    PixelType>
+    __global__ void histogram_smem_atomics(
+        const PixelType *in,
+        int width,
+        int height,
+        unsigned int *out)
+    {
+        // global position and size
+        int x = blockIdx.x * blockDim.x + threadIdx.x;
+        int y = blockIdx.y * blockDim.y + threadIdx.y;
+        int nx = blockDim.x * gridDim.x;
+        int ny = blockDim.y * gridDim.y;
+
+        // threads in workgroup
+        int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1
+        int nt = blockDim.x * blockDim.y; // total threads in workgroup
+
+        // group index in 0..ngroups-1
+        int g = blockIdx.x + blockIdx.y * gridDim.x;
+
+        // initialize smem (channel c starts at NUM_BINS * c + c, hence the 3 extra slots; NOTE(review): presumably padding against bank conflicts - confirm)
+        __shared__ unsigned int smem[ACTIVE_CHANNELS * NUM_BINS + 3];
+        for (int i = t; i < ACTIVE_CHANNELS * NUM_BINS + 3; i += nt)
+            smem[i] = 0;
+        __syncthreads();
+
+        // process pixels
+        // updates our group's partial histogram in smem
+        for (int col = x; col < width; col += nx)
+        {
+            for (int row = y; row < height; row += ny)
+            {
+                PixelType pixel = in[row * width + col];
+
+                unsigned int bins[ACTIVE_CHANNELS];
+                DecodePixel<NUM_BINS>(pixel, bins);
+
+                #pragma unroll
+                for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                    atomicAdd(&smem[(NUM_BINS * CHANNEL) + bins[CHANNEL] + CHANNEL], 1);
+            }
+        }
+
+        __syncthreads();    // all shared-memory updates must land before the copy-out below
+
+        // move to our workgroup's slice of output
+        out += g * NUM_PARTS;
+
+        // store local output to global (the per-channel padding offset is dropped here)
+        for (int i = t; i < NUM_BINS; i += nt)
+        {
+            #pragma unroll
+            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+                out[i + NUM_BINS * CHANNEL] = smem[i + NUM_BINS * CHANNEL + CHANNEL];
+        }
+    }
+
+    // Second pass histogram kernel (accumulation): thread i sums counter i over the n partial histograms
+    template <
+        int         NUM_PARTS,
+        int         ACTIVE_CHANNELS,
+        int         NUM_BINS>
+    __global__ void histogram_smem_accum(
+        const unsigned int *in,
+        int n,
+        unsigned int *out)
+    {
+        int i = blockIdx.x * blockDim.x + threadIdx.x;
+        if (i >= ACTIVE_CHANNELS * NUM_BINS) return; // out of range: valid counters are 0 .. ACTIVE_CHANNELS * NUM_BINS - 1
+        unsigned int total = 0;
+        for (int j = 0; j < n; j++)
+            total += in[i + NUM_PARTS * j];
+        out[i] = total;
+    }
+
+}   // namespace histogram_smem_atomics
+
+
+template <
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+double run_smem_atomics(        // returns the elapsed milliseconds of the two timed kernel launches
+    PixelType *d_image,         // device image, passed to the binning kernel
+    int width,
+    int height,
+    unsigned int *d_hist,       // device output written by the accumulation kernel
+    bool warmup)                // not used in the body
+{
+    enum
+    {
+        NUM_PARTS = 1024        // counters reserved per block; NOTE(review): must be >= ACTIVE_CHANNELS * NUM_BINS - confirm at call sites
+    };
+
+    cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, 0);
+
+    dim3 block(32, 4);
+    dim3 grid(16, 16);
+    int total_blocks = grid.x * grid.y;
+
+    // allocate partial histogram
+    unsigned int *d_part_hist;
+    cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));
+
+    dim3 block2(128);
+    dim3 grid2((ACTIVE_CHANNELS * NUM_BINS + block2.x - 1) / block2.x);    // ceil-div: one thread per output counter, sized by the accumulation block
+
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    histogram_smem_atomics::histogram_smem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
+        d_image,
+        width,
+        height,
+        d_part_hist);
+
+    histogram_smem_atomics::histogram_smem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
+        d_part_hist,
+        total_blocks,
+        d_hist);
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    cudaFree(d_part_hist);
+
+    return elapsed_millis;
+}
+
diff --git a/external/cub/experimental/histogram_compare.cu b/external/cub/experimental/histogram_compare.cu
new file mode 100644
index 00000000000..0c72aafa847
--- /dev/null
+++ b/external/cub/experimental/histogram_compare.cu
@@ -0,0 +1,635 @@
+/******************************************************************************
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console (defined ahead of the CUB includes so they can see it)
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <map>
+#include <vector>
+#include <algorithm>
+#include <cstdio>
+#include <fstream>
+
+#include "histogram/histogram_gmem_atomics.h"
+#include "histogram/histogram_smem_atomics.h"
+#include "histogram/histogram_cub.h"
+
+#include <cub/util_allocator.cuh>
+#include <test/test_util.h>
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants, and type declarations
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;  // Whether to display input/output to console
+bool                    g_report = false;   // Whether to display a full report in CSV format
+CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
+
+struct less_than_value      // comparator for (name, value) pairs: ascending by value; used to sort the timings
+{
+    inline bool operator()(
+        const std::pair<std::string, double> &a,
+        const std::pair<std::string, double> &b)
+    {
+        return a.second < b.second;     // only the double takes part; the name is ignored
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Targa (.tga) image file parsing
+//---------------------------------------------------------------------
+
+/**
+ * TGA image header info (fields are declared in the same order Parse() reads them)
+ */
+struct TgaHeader
+{
+    char idlength;              // ReadTga skips this many bytes after the header
+    char colormaptype;          // ReadTga accepts 0 or 1
+    char datatypecode;          // ReadTga accepts 2 (uncompressed) or 10 (compressed)
+    short colormaporigin;
+    short colormaplength;
+    char colormapdepth;
+    short x_origin;
+    short y_origin;
+    short width;
+    short height;
+    char bitsperpixel;          // ReadTga accepts 16, 24 or 32
+    char imagedescriptor;
+
+    void Parse (FILE *fptr)     // fills every field from fptr; fgetc/fread results are not checked
+    {
+        idlength = fgetc(fptr);
+        colormaptype = fgetc(fptr);
+        datatypecode = fgetc(fptr);
+        fread(&colormaporigin, 2, 1, fptr);     // 2-byte fields are copied raw, i.e. interpreted in host byte order
+        fread(&colormaplength, 2, 1, fptr);
+        colormapdepth = fgetc(fptr);
+        fread(&x_origin, 2, 1, fptr);
+        fread(&y_origin, 2, 1, fptr);
+        fread(&width, 2, 1, fptr);
+        fread(&height, 2, 1, fptr);
+        bitsperpixel = fgetc(fptr);
+        imagedescriptor = fgetc(fptr);
+    }
+
+    void Display (FILE *fptr)   // writes one labelled line per field to fptr
+    {
+        fprintf(fptr, "ID length:           %d\n", idlength);
+        fprintf(fptr, "Color map type:      %d\n", colormaptype);
+        fprintf(fptr, "Image type:          %d\n", datatypecode);
+        fprintf(fptr, "Color map offset:    %d\n", colormaporigin);
+        fprintf(fptr, "Color map length:    %d\n", colormaplength);
+        fprintf(fptr, "Color map depth:     %d\n", colormapdepth);
+        fprintf(fptr, "X origin:            %d\n", x_origin);
+        fprintf(fptr, "Y origin:            %d\n", y_origin);
+        fprintf(fptr, "Width:               %d\n", width);
+        fprintf(fptr, "Height:              %d\n", height);
+        fprintf(fptr, "Bits per pixel:      %d\n", bitsperpixel);
+        fprintf(fptr, "Descriptor:          %d\n", imagedescriptor);
+    }
+};
+
+
+/**
+ * Decode image byte data into pixel (bytes = 4, 3 or 2; any other value leaves pixel untouched)
+ */
+void ParseTgaPixel(uchar4 &pixel, unsigned char *tga_pixel, int bytes)
+{
+    if (bytes == 4)
+    {
+        pixel.x = tga_pixel[2];     // x, y, z take bytes 2, 1, 0 respectively
+        pixel.y = tga_pixel[1];
+        pixel.z = tga_pixel[0];
+        pixel.w = tga_pixel[3];     // fourth byte is copied through
+    }
+    else if (bytes == 3)
+    {
+        pixel.x = tga_pixel[2];
+        pixel.y = tga_pixel[1];
+        pixel.z = tga_pixel[0];
+        pixel.w = 0;                // no fourth byte in this layout
+    }
+    else if (bytes == 2)
+    {
+        pixel.x = (tga_pixel[1] & 0x7c) << 1;                                   // bits 6..2 of byte 1, moved to the top five bits
+        pixel.y = ((tga_pixel[1] & 0x03) << 6) | ((tga_pixel[0] & 0xe0) >> 2);  // low 2 bits of byte 1 joined with the top 3 bits of byte 0
+        pixel.z = (tga_pixel[0] & 0x1f) << 3;                                   // low 5 bits of byte 0, moved to the top five bits
+        pixel.w = (tga_pixel[1] & 0x80);                                        // top bit of byte 1, left in place (0 or 0x80)
+    }
+}
+
+
+/**
+ * Reads a type-2 (uncompressed) or type-10 (run-length encoded) .tga file into
+ * pixels, which is realloc()'d to width * height entries (pass NULL or a malloc'd
+ * buffer; the caller free()s it).  Prints a message and exits on any error.
+ */
+void ReadTga(uchar4* &pixels, int &width, int &height, const char *filename)
+{
+    // Open the file
+    FILE *fptr;
+    if ((fptr = fopen(filename, "rb")) == NULL)
+    {
+        fprintf(stderr, "File open failed\n");
+        exit(-1);
+    }
+
+    // Parse header
+    TgaHeader header;
+    header.Parse(fptr);
+    width = header.width;
+    height = header.height;
+    const int num_pixels = width * height;
+
+    // Verify compatibility
+    if (header.datatypecode != 2 && header.datatypecode != 10)
+    {
+        fprintf(stderr, "Can only handle image type 2 and 10\n");
+        exit(-1);
+    }
+    if (header.bitsperpixel != 16 && header.bitsperpixel != 24 && header.bitsperpixel != 32)
+    {
+        fprintf(stderr, "Can only handle pixel depths of 16, 24, and 32\n");
+        exit(-1);
+    }
+    if (header.colormaptype != 0 && header.colormaptype != 1)
+    {
+        fprintf(stderr, "Can only handle color map types of 0 and 1\n");
+        exit(-1);
+    }
+
+    // Skip the image ID field and the color map (colormaplength entries, each colormapdepth bits rounded up to bytes)
+    int skip_bytes = (unsigned char) header.idlength + (header.colormaptype * (unsigned short) header.colormaplength * ((header.colormapdepth + 7) / 8));
+    fseek(fptr, skip_bytes, SEEK_CUR);
+
+    int pixel_bytes = header.bitsperpixel / 8;      // bytes per stored pixel
+
+    // (Re)allocate and initialize pixel data; resizing stops a buffer left over from another image from being overrun
+    size_t image_bytes = num_pixels * sizeof(uchar4);
+    if ((pixels = (uchar4*) realloc(pixels, image_bytes)) == NULL)
+    {
+        fprintf(stderr, "malloc of image failed\n");
+        exit(-1);
+    }
+    memset(pixels, 0, image_bytes);
+
+    // Parse pixels
+    unsigned char   tga_pixel[5];
+    int             current_pixel = 0;
+    while (current_pixel < num_pixels)
+    {
+        if (header.datatypecode == 2)
+        {
+            // Uncompressed
+            if (fread(tga_pixel, 1, pixel_bytes, fptr) != (size_t) pixel_bytes)
+            {
+                fprintf(stderr, "Unexpected end of file at pixel %d  (uncompressed)\n", current_pixel);
+                exit(-1);
+            }
+            ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes);
+            current_pixel++;
+        }
+        else if (header.datatypecode == 10)
+        {
+            // Compressed: each packet starts with a count byte followed by its first pixel
+            if (fread(tga_pixel, 1, pixel_bytes + 1, fptr) != (size_t) (pixel_bytes + 1))
+            {
+                fprintf(stderr, "Unexpected end of file at pixel %d (compressed)\n", current_pixel);
+                exit(-1);
+            }
+            int run_length = tga_pixel[0] & 0x7f;
+            ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes);
+            current_pixel++;
+
+            if (tga_pixel[0] & 0x80)
+            {
+                // RLE chunk: repeat the pixel, but never write past the end of the image
+                for (int i = 0; (i < run_length) && (current_pixel < num_pixels); i++)
+                {
+                    ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes);
+                    current_pixel++;
+                }
+            }
+            else
+            {
+                // Normal chunk: read further raw pixels, but never write past the end of the image
+                for (int i = 0; (i < run_length) && (current_pixel < num_pixels); i++)
+                {
+                    if (fread(tga_pixel, 1, pixel_bytes, fptr) != (size_t) pixel_bytes)
+                    {
+                        fprintf(stderr, "Unexpected end of file at pixel %d (normal)\n", current_pixel);
+                        exit(-1);
+                    }
+                    ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes);
+                    current_pixel++;
+                }
+            }
+        }
+    }
+
+    fclose(fptr);
+}
+
+
+
+//---------------------------------------------------------------------
+// Random image generation
+//---------------------------------------------------------------------
+
+/**
+ * Generate a random image with specified entropy.  pixels is (re)allocated with
+ * realloc() to width * height entries, so it must be NULL or a malloc'd buffer.
+ */
+void GenerateRandomImage(uchar4* &pixels, int width, int height, int entropy_reduction)
+{
+    int num_pixels = width * height;
+    size_t image_bytes = num_pixels * sizeof(uchar4);
+    if ((pixels = (uchar4*) realloc(pixels, image_bytes)) == NULL)
+    {
+        fprintf(stderr, "malloc of image failed\n");
+        exit(-1);
+    }
+    for (int i = 0; i < num_pixels; ++i)
+    {
+        RandomBits(pixels[i].x, entropy_reduction);
+        RandomBits(pixels[i].y, entropy_reduction);
+        RandomBits(pixels[i].z, entropy_reduction);
+        RandomBits(pixels[i].w, entropy_reduction);
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Histogram verification
+//---------------------------------------------------------------------
+
+// Decode float4 pixel into bins: each of the first ACTIVE_CHANNELS samples is scaled by NUM_BINS and truncated
+template <int NUM_BINS, int ACTIVE_CHANNELS>
+void DecodePixelGold(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+{
+    float* samples = reinterpret_cast<float*>(&pixel);     // view the pixel as an array of floats
+
+    for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
+}
+
+// Decode uchar4 pixel into bins: each of the first ACTIVE_CHANNELS bytes is its own bin index (NUM_BINS is not consulted)
+template <int NUM_BINS, int ACTIVE_CHANNELS>
+void DecodePixelGold(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+{
+    unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);     // view the pixel as an array of bytes
+
+    for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+        bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
+}
+
+// Decode uchar1 pixel into bins: the single sample is its own bin index; only bins[0] is written
+template <int NUM_BINS, int ACTIVE_CHANNELS>
+void DecodePixelGold(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
+{
+    bins[0] = (unsigned int) pixel.x;
+}
+
+
+// Compute reference histogram on the host (generic over the pixel types DecodePixelGold accepts)
+template <
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+void HistogramGold(PixelType *image, int width, int height, unsigned int* hist)
+{
+    memset(hist, 0, ACTIVE_CHANNELS * NUM_BINS * sizeof(unsigned int));
+    // Visit the pixels row by row so that image is read sequentially; the counts do not depend on the order
+    for (int j = 0; j < height; j++)
+    {
+        for (int i = 0; i < width; i++)
+        {
+            PixelType pixel = image[i + j * width];
+
+            unsigned int bins[ACTIVE_CHANNELS];
+            DecodePixelGold<NUM_BINS>(pixel, bins);
+
+            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
+            {
+                hist[(NUM_BINS * CHANNEL) + bins[CHANNEL]]++;    // channel c owns bins [c * NUM_BINS, (c + 1) * NUM_BINS)
+            }
+        }
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Test execution
+//---------------------------------------------------------------------
+
+/**
+ * Run a specific histogram implementation: check it once against h_hist, then time it
+ */
+template <
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+void RunTest(
+    std::vector<std::pair<std::string, double> >&   timings,            // receives (short_name, average time in us)
+    PixelType*                                      d_pixels,
+    const int                                       width,
+    const int                                       height,
+    unsigned int *                                  d_hist,
+    unsigned int *                                  h_hist,             // reference histogram that d_hist is compared against
+    int                                             timing_iterations,
+    const char *                                    long_name,
+    const char *                                    short_name,
+    double (*f)(PixelType*, int, int, unsigned int*, bool))             // implementation under test; its return value is summed as milliseconds
+{
+    if (!g_report) printf("%s ", long_name); fflush(stdout);           // only the printf is conditional; fflush always runs
+
+    // Run single test to verify (and code cache)
+    (*f)(d_pixels, width, height, d_hist, !g_report);
+
+    int compare = CompareDeviceResults(h_hist, d_hist, ACTIVE_CHANNELS * NUM_BINS, true, g_verbose);   // non-zero is reported as FAIL
+    if (!g_report) printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+
+    double elapsed_ms = 0;
+    for (int i = 0; i < timing_iterations; i++)
+    {
+        elapsed_ms += (*f)(d_pixels, width, height, d_hist, false);
+    }
+    double avg_us = (elapsed_ms / timing_iterations) * 1000;    // average in us
+    timings.push_back(std::pair<std::string, double>(short_name, avg_us));
+
+    if (!g_report)
+    {
+        printf("Avg time %.3f us (%d iterations)\n", avg_us, timing_iterations); fflush(stdout);
+    }
+    else
+    {
+        printf("%.3f, ", avg_us); fflush(stdout);       // report mode: one CSV cell per implementation
+    }
+
+    AssertEquals(0, compare);       // checked last, after the timing has been recorded and printed
+}
+
+
+/**
+ * Evaluate corpus of histogram implementations on one host image (note: height is passed before width)
+ */
+template <
+    int         NUM_CHANNELS,
+    int         ACTIVE_CHANNELS,
+    int         NUM_BINS,
+    typename    PixelType>
+void TestMethods(
+    PixelType*  h_pixels,
+    int         height,
+    int         width,
+    int         timing_iterations,
+    double      bandwidth_GBs)
+{
+    // Copy data to gpu
+    PixelType* d_pixels;
+    size_t pixel_bytes = width * height * sizeof(PixelType);
+    CubDebugExit(g_allocator.DeviceAllocate((void**) &d_pixels, pixel_bytes));
+    CubDebugExit(cudaMemcpy(d_pixels, h_pixels, pixel_bytes, cudaMemcpyHostToDevice));
+
+    if (g_report) printf("%.3f, ", double(pixel_bytes) / bandwidth_GBs / 1000);
+
+    // Allocate results arrays on cpu/gpu, failing loudly rather than handing null buffers to the tests
+    size_t histogram_bytes = NUM_BINS * ACTIVE_CHANNELS * sizeof(unsigned int);
+    unsigned int *h_hist = (unsigned int *) malloc(histogram_bytes);
+    unsigned int *d_hist;
+    if (h_hist == NULL) { fprintf(stderr, "malloc of histogram failed\n"); exit(-1); }
+    CubDebugExit(g_allocator.DeviceAllocate((void **) &d_hist, histogram_bytes));
+
+    // Compute reference cpu histogram
+    HistogramGold<ACTIVE_CHANNELS, NUM_BINS>(h_pixels, width, height, h_hist);
+
+    // Store timings
+    std::vector<std::pair<std::string, double> > timings;
+
+    // Run experiments
+    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
+        "CUB", "CUB", run_cub_histogram<NUM_CHANNELS, ACTIVE_CHANNELS, NUM_BINS, PixelType>);
+    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
+        "Shared memory atomics", "smem atomics", run_smem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
+    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
+        "Global memory atomics", "gmem atomics", run_gmem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
+
+    // Report timings, fastest first
+    if (!g_report)
+    {
+        std::sort(timings.begin(), timings.end(), less_than_value());
+        printf("Timings (us):\n");
+        for (size_t i = 0; i < timings.size(); i++)
+        {
+            double bandwidth = height * width * sizeof(PixelType) / timings[i].second / 1000;     // bytes per us, scaled to GB/s
+            printf("\t %.3f %s (%.3f GB/s, %.3f%% peak)\n", timings[i].second, timings[i].first.c_str(), bandwidth, bandwidth / bandwidth_GBs * 100);
+        }
+        printf("\n");
+    }
+
+    // Free data
+    CubDebugExit(g_allocator.DeviceFree(d_pixels));
+    CubDebugExit(g_allocator.DeviceFree(d_hist));
+    free(h_hist);
+}
+
+
+/**
+ * Test different problem genres (uchar1, uchar4 and float4 views of the same uchar4 image)
+ */
+void TestGenres(
+    uchar4*     uchar4_pixels,
+    int         height,
+    int         width,
+    int         timing_iterations,
+    double      bandwidth_GBs)
+{
+    int num_pixels = width * height;
+
+    {
+        if (!g_report) printf("1 channel uchar1 tests (256-bin):\n\n"); fflush(stdout);
+
+        size_t      image_bytes     = num_pixels * sizeof(uchar1);
+        uchar1*     uchar1_pixels   = (uchar1*) malloc(image_bytes);
+        if (uchar1_pixels == NULL) { fprintf(stderr, "malloc of image failed\n"); exit(-1); }
+        // Convert to 1-channel (averaging first 3 channels)
+        for (int i = 0; i < num_pixels; ++i)
+        {
+            uchar1_pixels[i].x = (unsigned char)
+                (((unsigned int) uchar4_pixels[i].x +
+                  (unsigned int) uchar4_pixels[i].y +
+                  (unsigned int) uchar4_pixels[i].z) / 3);
+        }
+
+        TestMethods<1, 1, 256>(uchar1_pixels, height, width, timing_iterations, bandwidth_GBs);    // TestMethods takes height before width
+        free(uchar1_pixels);
+        if (g_report) printf(", ");
+    }
+
+    {
+        if (!g_report) printf("3/4 channel uchar4 tests (256-bin):\n\n"); fflush(stdout);
+        TestMethods<4, 3, 256>(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
+        if (g_report) printf(", ");
+    }
+
+    {
+        if (!g_report) printf("3/4 channel float4 tests (256-bin):\n\n"); fflush(stdout);
+        size_t      image_bytes     = num_pixels * sizeof(float4);
+        float4*     float4_pixels   = (float4*) malloc(image_bytes);
+        if (float4_pixels == NULL) { fprintf(stderr, "malloc of image failed\n"); exit(-1); }
+        // Convert to float4 with range [0.0, 1.0)
+        for (int i = 0; i < num_pixels; ++i)
+        {
+            float4_pixels[i].x = float(uchar4_pixels[i].x) / 256;
+            float4_pixels[i].y = float(uchar4_pixels[i].y) / 256;
+            float4_pixels[i].z = float(uchar4_pixels[i].z) / 256;
+            float4_pixels[i].w = float(uchar4_pixels[i].w) / 256;
+        }
+        TestMethods<4, 3, 256>(float4_pixels, height, width, timing_iterations, bandwidth_GBs);
+        free(float4_pixels);
+        if (g_report) printf("\n");
+    }
+}
+
+
+/**
+ * Main: benchmarks one image (default) or, with --report, a CSV sweep over random and file images
+ */
+int main(int argc, char **argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf(
+            "%s "
+            "[--device=<device-id>] "
+            "[--v] [--report] "
+            "[--i=<timing iterations>] "
+            "\n\t"
+                "--file=<.tga filename> "
+            "\n\t"
+                "--entropy=<-1 (0%%), 0 (100%%), 1 (81%%), 2 (54%%), 3 (34%%), 4 (20%%), ...> "
+                "[--height=<default: 1080>] "
+                "[--width=<default: 1920>] "
+            "\n", argv[0]);     // this text is a printf format, so literal percent signs must be written as %%
+        exit(0);
+    }
+
+    std::string         filename;
+    int                 timing_iterations   = 100;
+    int                 entropy_reduction   = 0;
+    int                 height              = 1080;
+    int                 width               = 1920;
+
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_report = args.CheckCmdLineFlag("report");
+    args.GetCmdLineArgument("i", timing_iterations);
+    args.GetCmdLineArgument("file", filename);
+    args.GetCmdLineArgument("height", height);
+    args.GetCmdLineArgument("width", width);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get GPU device bandwidth (GB/s)
+    int device_ordinal, bus_width, mem_clock_khz;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+    CubDebugExit(cudaDeviceGetAttribute(&bus_width, cudaDevAttrGlobalMemoryBusWidth, device_ordinal));
+    CubDebugExit(cudaDeviceGetAttribute(&mem_clock_khz, cudaDevAttrMemoryClockRate, device_ordinal));
+    double bandwidth_GBs = double(bus_width) * mem_clock_khz * 2 / 8 / 1000 / 1000;
+
+    // Run test(s)
+    uchar4* uchar4_pixels = NULL;
+    if (!g_report)
+    {
+        if (!filename.empty())
+        {
+            // Parse targa file
+            ReadTga(uchar4_pixels, width, height, filename.c_str());
+            printf("File %s: width(%d) height(%d)\n\n", filename.c_str(), width, height); fflush(stdout);
+        }
+        else
+        {
+            // Generate image
+            GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
+            printf("Random image: entropy-reduction(%d) width(%d) height(%d)\n\n", entropy_reduction, width, height); fflush(stdout);
+        }
+
+        TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
+    }
+    else
+    {
+        // Run test suite
+        printf("Test, MIN, RLE CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM\n");
+
+        // Entropy reduction tests
+        for (entropy_reduction = 0; entropy_reduction < 5; ++entropy_reduction)
+        {
+            printf("entropy reduction %d, ", entropy_reduction);
+            GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
+            TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
+        }
+        printf("entropy reduction -1, ");
+        GenerateRandomImage(uchar4_pixels, width, height, -1);
+        TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
+        printf("\n");
+
+        // File image tests
+        std::vector<std::string> file_tests;
+        file_tests.push_back("animals");
+        file_tests.push_back("apples");
+        file_tests.push_back("sunset");
+        file_tests.push_back("cheetah");
+        file_tests.push_back("nature");
+        file_tests.push_back("operahouse");
+        file_tests.push_back("austin");
+        file_tests.push_back("cityscape");
+
+        for (size_t i = 0; i < file_tests.size(); ++i)
+        {
+            printf("%s, ", file_tests[i].c_str());
+            std::string test_filename = std::string("histogram/benchmark/") + file_tests[i] + ".tga";  // distinct name: no longer shadows filename
+            ReadTga(uchar4_pixels, width, height, test_filename.c_str());
+            TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
+        }
+    }
+
+    free(uchar4_pixels);
+
+    CubDebugExit(cudaDeviceSynchronize());
+    printf("\n\n");
+
+    return 0;
+}
diff --git a/external/cub/experimental/sparse_matrix.h b/external/cub/experimental/sparse_matrix.h
new file mode 100644
index 00000000000..5ac34a1de53
--- /dev/null
+++ b/external/cub/experimental/sparse_matrix.h
@@ -0,0 +1,1244 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Matrix data structures and parsing logic
+ ******************************************************************************/
+
+#pragma once
+
+#include <cmath>
+#include <cstring>
+
+#include <iterator>
+#include <string>
+#include <algorithm>
+#include <iostream>
+#include <queue>
+#include <set>
+#include <fstream>
+#include <stdio.h>
+
+#ifdef CUB_MKL
+    #include <numa.h>
+    #include <mkl.h>
+#endif
+
+using namespace std;
+
+/******************************************************************************
+ * Graph statistics
+ ******************************************************************************/
+
+struct GraphStats       // plain record of matrix statistics plus a printer; nothing here computes the values
+{
+    int         num_rows;
+    int         num_cols;
+    int         num_nonzeros;
+
+    double      diag_dist_mean;         // mean
+    double      diag_dist_std_dev;      // sample std dev
+    double      pearson_r;              // NOTE(review): presumably a Pearson correlation coefficient, not a coefficient of variation -- confirm
+
+    double      row_length_mean;        // mean
+    double      row_length_std_dev;     // sample std_dev
+    double      row_length_variation;   // coefficient of variation
+    double      row_length_skewness;    // skewness
+
+    void Display(bool show_labels = true)   // prints all ten fields to stdout, in declaration order
+    {
+        if (show_labels)        // labelled form: one "name: value" line per field
+            printf("\n"
+                "\t num_rows: %d\n"
+                "\t num_cols: %d\n"
+                "\t num_nonzeros: %d\n"
+                "\t diag_dist_mean: %.2f\n"
+                "\t diag_dist_std_dev: %.2f\n"
+                "\t pearson_r: %f\n"
+                "\t row_length_mean: %.5f\n"
+                "\t row_length_std_dev: %.5f\n"
+                "\t row_length_variation: %.5f\n"
+                "\t row_length_skewness: %.5f\n",
+                    num_rows,
+                    num_cols,
+                    num_nonzeros,
+                    diag_dist_mean,
+                    diag_dist_std_dev,
+                    pearson_r,
+                    row_length_mean,
+                    row_length_std_dev,
+                    row_length_variation,
+                    row_length_skewness);
+        else                    // bare form: comma-separated values on one line, with no trailing newline
+            printf(
+                "%d, "
+                "%d, "
+                "%d, "
+                "%.2f, "
+                "%.2f, "
+                "%f, "
+                "%.5f, "
+                "%.5f, "
+                "%.5f, "
+                "%.5f, ",
+                    num_rows,
+                    num_cols,
+                    num_nonzeros,
+                    diag_dist_mean,
+                    diag_dist_std_dev,
+                    pearson_r,
+                    row_length_mean,
+                    row_length_std_dev,
+                    row_length_variation,
+                    row_length_skewness);
+    }
+};
+
+
+
+/******************************************************************************
+ * COO matrix type
+ ******************************************************************************/
+
+
+/**
+ * COO matrix type.  A COO matrix is just a vector of edge tuples.  Tuples are sorted
+ * first by row, then by column.
+ */
+template<typename ValueT, typename OffsetT>
+struct CooMatrix
+{
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    // One non-zero entry of the matrix: (row, column, value)
+    struct CooTuple
+    {
+        OffsetT            row;
+        OffsetT            col;
+        ValueT             val;
+
+        // Default construction initializes none of the fields
+        CooTuple() {}
+        // Coordinates only; val is not initialized
+        CooTuple(OffsetT row, OffsetT col) : row(row), col(col) {}
+        // Coordinates and value
+        CooTuple(OffsetT row, OffsetT col, ValueT val) : row(row), col(col), val(val) {}
+
+        /**
+         * Ordering used when sorting tuples: by row first, ties broken by
+         * column.  The value takes no part in the comparison.
+         */
+        bool operator<(const CooTuple &other) const
+        {
+            if (row != other.row) return row < other.row;
+            return col < other.col;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Data members
+    //---------------------------------------------------------------------
+
+    // Fields
+    int                 num_rows;
+    int                 num_cols;
+    int                 num_nonzeros;
+    CooTuple*           coo_tuples;
+
+    //---------------------------------------------------------------------
+    // Methods
+    //---------------------------------------------------------------------
+
+    // Constructor: an empty matrix (all counts zero) with no tuple array allocated
+    CooMatrix() : num_rows(0), num_cols(0), num_nonzeros(0), coo_tuples(NULL) {}
+
+
+    /**
+     * Release the tuple array (num_rows, num_cols and num_nonzeros are left as they are)
+     */
+    void Clear()
+    {
+        delete[] coo_tuples;    // delete[] on a null pointer does nothing
+        coo_tuples = NULL;
+    }
+
+
+    // Destructor: releases the tuple array through Clear()
+    ~CooMatrix()
+    {
+        Clear();
+    }
+
+
+    // Display matrix to stdout: a summary line, a column header, then one line per tuple
+    void Display()
+    {
+        cout << "COO Matrix (" << num_rows << " rows, " << num_cols << " columns, " << num_nonzeros << " non-zeros):\n";
+        cout << "Ordinal, Row, Column, Value\n";
+        for (int i = 0; i < num_nonzeros; i++)
+        {
+            cout << '\t' << i << ',' << coo_tuples[i].row << ',' << coo_tuples[i].col << ',' << coo_tuples[i].val << "\n";    // i is the tuple's position in the array
+        }
+    }
+
+
+    /**
+     * Builds a symmetric COO sparse from an asymmetric CSR matrix: every CSR entry is stored twice, as (row, col) and as (col, row).
+     */
+    template <typename CsrMatrixT>
+    void InitCsrSymmetric(CsrMatrixT &csr_matrix)
+    {
+        if (coo_tuples)     // refuse to overwrite a matrix that already holds tuples
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        num_rows        = csr_matrix.num_cols;      // NOTE(review): rows/cols are taken crosswise from the input; presumably a square input is expected -- confirm
+        num_cols        = csr_matrix.num_rows;
+        num_nonzeros    = csr_matrix.num_nonzeros * 2;
+        coo_tuples      = new CooTuple[num_nonzeros];
+
+        for (OffsetT row = 0; row < csr_matrix.num_rows; ++row)
+        {
+            for (OffsetT nonzero = csr_matrix.row_offsets[row]; nonzero < csr_matrix.row_offsets[row + 1]; ++nonzero)
+            {
+                coo_tuples[nonzero].row = row;      // first half of the array: the entry as stored in the CSR input
+                coo_tuples[nonzero].col = csr_matrix.column_indices[nonzero];
+                coo_tuples[nonzero].val = csr_matrix.values[nonzero];
+
+                coo_tuples[csr_matrix.num_nonzeros + nonzero].row = coo_tuples[nonzero].col;    // second half: the same entry with row and column exchanged
+                coo_tuples[csr_matrix.num_nonzeros + nonzero].col = coo_tuples[nonzero].row;
+                coo_tuples[csr_matrix.num_nonzeros + nonzero].val = csr_matrix.values[nonzero];
+
+            }
+        }
+
+        // Sort by rows, then columns (no duplicates are merged or removed)
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+    }
+
+    /**
+     * Builds a COO sparse from a relabeled CSR matrix: row r and column c of each entry become relabel_indices[r] and relabel_indices[c].
+     */
+    template <typename CsrMatrixT>
+    void InitCsrRelabel(CsrMatrixT &csr_matrix, OffsetT* relabel_indices)
+    {
+        if (coo_tuples)     // refuse to overwrite a matrix that already holds tuples
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        num_rows        = csr_matrix.num_rows;
+        num_cols        = csr_matrix.num_cols;
+        num_nonzeros    = csr_matrix.num_nonzeros;
+        coo_tuples      = new CooTuple[num_nonzeros];
+
+        for (OffsetT row = 0; row < num_rows; ++row)
+        {
+            for (OffsetT nonzero = csr_matrix.row_offsets[row]; nonzero < csr_matrix.row_offsets[row + 1]; ++nonzero)
+            {
+                coo_tuples[nonzero].row = relabel_indices[row];     // the same lookup table is applied to rows and columns
+                coo_tuples[nonzero].col = relabel_indices[csr_matrix.column_indices[nonzero]];
+                coo_tuples[nonzero].val = csr_matrix.values[nonzero];
+            }
+        }
+
+        // Sort by rows, then columns (of the relabeled coordinates)
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+    }
+
+
+
+    /**
+     * Builds a METIS COO sparse from the given file.
+     */
+    void InitMetis(const string &metis_filename)
+    {
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        // TODO: METIS parsing is not implemented; metis_filename is unused and no member is modified
+    }
+
+
+    /**
+     * Builds a COO sparse matrix from the given MARKET-format file.
+     */
+    void InitMarket(
+        const string&   market_filename,
+        ValueT          default_value       = 1.0,
+        bool            verbose             = false)
+    {
+        if (verbose) {
+            printf("Reading... "); fflush(stdout);
+        }
+
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        std::ifstream ifs;
+        ifs.open(market_filename.c_str(), std::ifstream::in);
+        if (!ifs.good())
+        {
+            fprintf(stderr, "Error opening file\n");
+            exit(1);
+        }
+
+        bool    array = false;          // Banner contains "array": lines carry values only
+        bool    symmetric = false;      // Banner contains "symmetric": off-diagonal entries are mirrored
+        bool    skew = false;           // Banner contains "skew": mirrored values are negated
+        int     current_edge = -1;      // Next tuple slot; stays -1 until the size line has been parsed
+        char    line[1024];
+
+        if (verbose) {
+            printf("Parsing... "); fflush(stdout);
+        }
+
+        while (true)
+        {
+            ifs.getline(line, 1024);
+            if (ifs.fail())
+            {
+                // Nothing more to read.  A final line without '\n' sets only eofbit, so it is still parsed
+                break;
+            }
+
+            if (line[0] == '%')
+            {
+                // Comment
+                if (line[1] == '%')
+                {
+                    // Banner
+                    symmetric   = (strstr(line, "symmetric") != NULL);
+                    skew        = (strstr(line, "skew") != NULL);
+                    array       = (strstr(line, "array") != NULL);
+
+                    if (verbose) {
+                        printf("(symmetric: %d, skew: %d, array: %d) ", symmetric, skew, array); fflush(stdout);
+                    }
+                }
+            }
+            else if (current_edge == -1)
+            {
+                // Problem description
+                int nparsed = sscanf(line, "%d %d %d", &num_rows, &num_cols, &num_nonzeros);
+                if ((!array) && (nparsed == 3))
+                {
+                    if (symmetric)
+                        num_nonzeros *= 2;
+
+                    // Allocate coo matrix
+                    coo_tuples = new CooTuple[num_nonzeros];
+                    current_edge = 0;
+
+                }
+                else if (array && (nparsed == 2))
+                {
+                    // Allocate coo matrix
+                    num_nonzeros = num_rows * num_cols;
+                    coo_tuples = new CooTuple[num_nonzeros];
+                    current_edge = 0;
+                }
+                else
+                {
+                    fprintf(stderr, "Error parsing MARKET matrix: invalid problem description: %s\n", line);
+                    exit(1);
+                }
+
+            }
+            else
+            {
+                // Edge
+                if (current_edge >= num_nonzeros)
+                {
+                    fprintf(stderr, "Error parsing MARKET matrix: encountered more than %d num_nonzeros\n", num_nonzeros);
+                    exit(1);
+                }
+
+                int row, col;
+                double val;
+
+                if (array)
+                {
+                    if (sscanf(line, "%lf", &val) != 1)
+                    {
+                        fprintf(stderr, "Error parsing MARKET matrix: badly formed current_edge: '%s' at edge %d\n", line, current_edge);
+                        exit(1);
+                    }
+                    col = (current_edge / num_rows);
+                    row = (current_edge - (num_rows * col));
+
+                    coo_tuples[current_edge] = CooTuple(row, col, val);    // Indices derived from the slot are already zero-based
+                }
+                else
+                {
+                    // Parse nonzero (note: using strtol and strtod is 2x faster than sscanf or istream parsing)
+                    char *l = line;
+                    char *t = NULL;
+
+                    // parse row
+                    row = strtol(l, &t, 0);
+                    if (t == l)
+                    {
+                        fprintf(stderr, "Error parsing MARKET matrix: badly formed row at edge %d\n", current_edge);
+                        exit(1);
+                    }
+                    l = t;
+
+                    // parse col
+                    col = strtol(l, &t, 0);
+                    if (t == l)
+                    {
+                        fprintf(stderr, "Error parsing MARKET matrix: badly formed col at edge %d\n", current_edge);
+                        exit(1);
+                    }
+                    l = t;
+
+                    // parse val
+                    val = strtod(l, &t);
+                    if (t == l)
+                    {
+                        val = default_value;
+                    }
+
+                    // Indices in the file are one-based; store them zero-based
+                    coo_tuples[current_edge] = CooTuple(row - 1, col - 1, val);
+                }
+
+                current_edge++;
+
+                if (symmetric && (row != col))
+                {
+                    // The mirrored entry needs a slot of its own; never write past the allocation
+                    if (current_edge >= num_nonzeros)
+                    {
+                        fprintf(stderr, "Error parsing MARKET matrix: encountered more than %d num_nonzeros\n", num_nonzeros);
+                        exit(1);
+                    }
+
+                    coo_tuples[current_edge].row = coo_tuples[current_edge - 1].col;
+                    coo_tuples[current_edge].col = coo_tuples[current_edge - 1].row;
+                    coo_tuples[current_edge].val = coo_tuples[current_edge - 1].val * (skew ? -1 : 1);
+                    current_edge++;
+                }
+            }
+        }
+
+        // Without a size line nothing was allocated and current_edge is still -1
+        if (current_edge < 0)
+        {
+            fprintf(stderr, "Error parsing MARKET matrix: missing problem description\n");
+            exit(1);
+        }
+
+        // Adjust nonzero count (nonzeros along the diagonal aren't reversed)
+        num_nonzeros = current_edge;
+
+        if (verbose) {
+            printf("done. Ordering..."); fflush(stdout);
+        }
+
+        // Sort by rows, then columns
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+
+        if (verbose) {
+            printf("done. "); fflush(stdout);
+        }
+
+        ifs.close();
+    }
+
+
+    /**
+     * Builds a dense matrix
+     */
+    int InitDense(
+        OffsetT     num_rows,
+        OffsetT     num_cols,
+        ValueT      default_value   = 1.0,
+        bool        verbose         = false)
+    {
+        if (coo_tuples != NULL)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        // The parameters shadow the members of the same name, hence the explicit this->
+        this->num_rows  = num_rows;
+        this->num_cols  = num_cols;
+        num_nonzeros    = num_rows * num_cols;
+        coo_tuples      = new CooTuple[num_nonzeros];
+
+        // Fill every (row, col) position with default_value, walking the array row by row
+        OffsetT slot = 0;
+        for (OffsetT r = 0; r < num_rows; ++r)
+        {
+            for (OffsetT c = 0; c < num_cols; ++c, ++slot)
+                coo_tuples[slot] = CooTuple(r, c, default_value);
+        }
+
+        // Sort by rows, then columns
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+
+        return 0;
+    }
+
+    /**
+     * Builds a wheel COO sparse matrix with the given number of spokes (hub vertex 0 plus a rim cycle).
+     */
+    int InitWheel(
+        OffsetT     spokes,
+        ValueT      default_value   = 1.0,
+        bool        verbose         = false)
+    {
+        if (coo_tuples != NULL)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        // Vertex 0 is the hub; vertices 1..spokes form the rim
+        num_rows        = spokes + 1;
+        num_cols        = num_rows;
+        num_nonzeros    = spokes * 2;
+        coo_tuples      = new CooTuple[num_nonzeros];
+
+        // First half of the array: one hub-to-rim edge per spoke
+        CooTuple *hub_edges = coo_tuples;
+        for (OffsetT s = 0; s < spokes; ++s)
+        {
+            hub_edges[s] = CooTuple(0, s + 1, default_value);
+        }
+
+        // Second half: each rim vertex points at its successor, wrapping around to vertex 1
+        CooTuple *rim_edges = coo_tuples + spokes;
+        for (OffsetT s = 0; s < spokes; ++s)
+        {
+            const OffsetT successor = (s + 1) % spokes;
+            rim_edges[s] = CooTuple(s + 1, successor + 1, default_value);
+        }
+
+        // Sort by rows, then columns
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+
+        return 0;
+    }
+
+
+    /**
+     * Builds a square 2D grid COO sparse matrix.  Interior vertices have degree 5 when including
+     * a self-loop.
+     *
+     * Returns 0 on success; exits the process if the matrix was already constructed.
+     */
+    int InitGrid2d(OffsetT width, bool self_loop, ValueT default_value = 1.0)
+    {
+        // Emits one tuple per (vertex, in-grid neighbor) pair, plus one self-loop per vertex when
+        // self_loop is set.  Always returns 0; an already-constructed matrix exits the process.
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            exit(1);
+        }
+
+        // Vertex counts by position; each kind has a fixed number of in-grid neighbors (4, 3, 2)
+        OffsetT interior_nodes  = (width - 2) * (width - 2);
+        OffsetT edge_nodes      = (width - 2) * 4;
+        OffsetT corner_nodes    = 4;
+        num_rows                       = width * width;
+        num_cols                       = num_rows;
+        num_nonzeros                   = (interior_nodes * 4) + (edge_nodes * 3) + (corner_nodes * 2);
+
+        if (self_loop)
+            num_nonzeros += num_rows;
+
+        coo_tuples              = new CooTuple[num_nonzeros];
+        OffsetT current_edge    = 0;
+
+        for (OffsetT j = 0; j < width; j++)
+        {
+            for (OffsetT k = 0; k < width; k++)
+            {
+                // Row-major vertex id of grid cell (j, k)
+                OffsetT me = (j * width) + k;
+
+                // West (written as k > 0 so the test is also correct for an unsigned OffsetT)
+                if (k > 0) {
+                    coo_tuples[current_edge] = CooTuple(me, me - 1, default_value);
+                    current_edge++;
+                }
+
+                // East
+                if (k + 1 < width) {
+                    coo_tuples[current_edge] = CooTuple(me, me + 1, default_value);
+                    current_edge++;
+                }
+
+                // North
+                if (j > 0) {
+                    coo_tuples[current_edge] = CooTuple(me, me - width, default_value);
+                    current_edge++;
+                }
+
+                // South
+                if (j + 1 < width) {
+                    coo_tuples[current_edge] = CooTuple(me, me + width, default_value);
+                    current_edge++;
+                }
+
+                // Optional self-loop
+                if (self_loop)
+                {
+                    coo_tuples[current_edge] = CooTuple(me, me, default_value);
+                    current_edge++;
+                }
+            }
+        }
+
+        // Sort by rows, then columns
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+
+        return 0;
+    }
+
+
+    /**
+     * Builds a square 3D grid COO sparse matrix.  Interior vertices have degree 7 when including
+     * a self-loop.  Every value is set to default_value; coo_tuples are sorted.
+     */
+    int InitGrid3d(OffsetT width, bool self_loop, ValueT default_value = 1.0)
+    {
+        // Emits one tuple per (vertex, in-grid neighbor) pair, plus one self-loop per vertex when
+        // self_loop is set.  Returns 0 on success, or -1 if the matrix was already constructed.
+        if (coo_tuples)
+        {
+            fprintf(stderr, "Matrix already constructed\n");
+            return -1;
+        }
+
+        // Vertex counts by position; each kind has a fixed number of in-grid neighbors (6, 5, 4, 3)
+        OffsetT interior_nodes  = (width - 2) * (width - 2) * (width - 2);
+        OffsetT face_nodes      = (width - 2) * (width - 2) * 6;
+        OffsetT edge_nodes      = (width - 2) * 12;
+        OffsetT corner_nodes    = 8;
+        num_cols                       = width * width * width;
+        num_rows                       = num_cols;
+        num_nonzeros                     = (interior_nodes * 6) + (face_nodes * 5) + (edge_nodes * 4) + (corner_nodes * 3);
+
+        if (self_loop)
+            num_nonzeros += num_rows;
+
+        coo_tuples              = new CooTuple[num_nonzeros];
+        OffsetT current_edge    = 0;
+
+        // Id distance between two cells whose i coordinates differ by one
+        const OffsetT plane = width * width;
+
+        for (OffsetT i = 0; i < width; i++)
+        {
+            for (OffsetT j = 0; j < width; j++)
+            {
+                for (OffsetT k = 0; k < width; k++)
+                {
+                    // Vertex id of grid cell (i, j, k)
+                    OffsetT me = (i * plane) + (j * width) + k;
+
+                    // Up (written as k > 0 so the test is also correct for an unsigned OffsetT)
+                    if (k > 0) {
+                        coo_tuples[current_edge] = CooTuple(me, me - 1, default_value);
+                        current_edge++;
+                    }
+
+                    // Down
+                    if (k + 1 < width) {
+                        coo_tuples[current_edge] = CooTuple(me, me + 1, default_value);
+                        current_edge++;
+                    }
+
+                    // West
+                    if (j > 0) {
+                        coo_tuples[current_edge] = CooTuple(me, me - width, default_value);
+                        current_edge++;
+                    }
+
+                    // East
+                    if (j + 1 < width) {
+                        coo_tuples[current_edge] = CooTuple(me, me + width, default_value);
+                        current_edge++;
+                    }
+
+                    // North
+                    if (i > 0) {
+                        coo_tuples[current_edge] = CooTuple(me, me - plane, default_value);
+                        current_edge++;
+                    }
+
+                    // South
+                    if (i + 1 < width) {
+                        coo_tuples[current_edge] = CooTuple(me, me + plane, default_value);
+                        current_edge++;
+                    }
+
+                    // Optional self-loop
+                    if (self_loop)
+                    {
+                        coo_tuples[current_edge] = CooTuple(me, me, default_value);
+                        current_edge++;
+                    }
+                }
+            }
+        }
+
+        // Sort by rows, then columns
+        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
+
+        return 0;
+    }
+};
+
+
+
+/******************************************************************************
+ * CSR matrix type
+ ******************************************************************************/
+
+
+/**
+ * CSR sparse format matrix
+ */
+template<
+    typename ValueT,
+    typename OffsetT>
+struct CsrMatrix
+{
+    int         num_rows;           // Number of matrix rows
+    int         num_cols;           // Number of matrix columns
+    int         num_nonzeros;       // Number of stored entries
+    OffsetT*    row_offsets;        // num_rows + 1 entries; row r owns entries [row_offsets[r], row_offsets[r + 1])
+    OffsetT*    column_indices;     // Column of each stored entry (num_nonzeros entries)
+    ValueT*     values;             // Value of each stored entry (num_nonzeros entries)
+    bool        numa_malloc;        // Whether the arrays are allocated with libnuma (only possible under CUB_MKL)
+
+    /**
+     * Constructor: creates an empty matrix that owns no arrays
+     */
+    CsrMatrix() : num_rows(0), num_cols(0), num_nonzeros(0), row_offsets(NULL), column_indices(NULL), values(NULL)
+    {
+#ifdef CUB_MKL
+        numa_malloc = ((numa_available() >= 0) && (numa_num_task_nodes() > 1));
+#else
+        numa_malloc = false;
+#endif
+    }
+
+
+    /**
+     * Clear: frees the three arrays and nulls their pointers (the dimension fields are left as they are)
+     */
+    void Clear()
+    {
+#ifdef CUB_MKL
+        if (numa_malloc)
+        {
+            numa_free(row_offsets, sizeof(OffsetT) * (num_rows + 1));
+            numa_free(values, sizeof(ValueT) * num_nonzeros);
+            numa_free(column_indices, sizeof(OffsetT) * num_nonzeros);
+        }
+        else
+        {
+            if (row_offsets)    mkl_free(row_offsets);
+            if (column_indices) mkl_free(column_indices);
+            if (values)         mkl_free(values);
+        }
+
+#else
+        if (row_offsets)    delete[] row_offsets;
+        if (column_indices) delete[] column_indices;
+        if (values)         delete[] values;
+#endif
+
+        row_offsets = NULL;
+        column_indices = NULL;
+        values = NULL;
+    }
+
+    /**
+     * Destructor
+     */
+    ~CsrMatrix()
+    {
+        Clear();
+    }
+
+    GraphStats Stats()
+    {
+        GraphStats stats;
+        stats.num_rows = num_rows;
+        stats.num_cols = num_cols;
+        stats.num_nonzeros = num_nonzeros;
+
+        //
+        // Compute diag-distance statistics (running mean and sum of squares of |col - row|)
+        //
+
+        OffsetT samples     = 0;
+        double  mean        = 0.0;
+        double  ss_tot      = 0.0;
+
+        for (OffsetT row = 0; row < num_rows; ++row)
+        {
+            OffsetT nz_idx_start    = row_offsets[row];
+            OffsetT nz_idx_end      = row_offsets[row + 1];
+
+            for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx)
+            {
+                OffsetT col             = column_indices[nz_idx];
+                double x                = (col > row) ? col - row : row - col;
+
+                samples++;
+                double delta            = x - mean;
+                mean                    = mean + (delta / samples);
+                ss_tot                  += delta * (x - mean);
+            }
+        }
+        stats.diag_dist_mean            = mean;
+        double variance                 = ss_tot / samples;
+        stats.diag_dist_std_dev         = sqrt(variance);
+
+
+        //
+        // Compute deming statistics
+        //
+
+        samples         = 0;
+        double mean_x   = 0.0;
+        double mean_y   = 0.0;
+        double ss_x     = 0.0;
+        double ss_y     = 0.0;
+
+        for (OffsetT row = 0; row < num_rows; ++row)
+        {
+            OffsetT nz_idx_start    = row_offsets[row];
+            OffsetT nz_idx_end      = row_offsets[row + 1];
+
+            for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx)
+            {
+                OffsetT col             = column_indices[nz_idx];
+
+                samples++;
+                double x                = col;
+                double y                = row;
+                double delta;
+
+                delta                   = x - mean_x;
+                mean_x                  = mean_x + (delta / samples);
+                ss_x                    += delta * (x - mean_x);
+
+                delta                   = y - mean_y;
+                mean_y                  = mean_y + (delta / samples);
+                ss_y                    += delta * (y - mean_y);
+            }
+        }
+
+        samples         = 0;
+        double s_xy     = 0.0;
+        double s_xxy    = 0.0;
+        double s_xyy    = 0.0;
+        for (OffsetT row = 0; row < num_rows; ++row)
+        {
+            OffsetT nz_idx_start    = row_offsets[row];
+            OffsetT nz_idx_end      = row_offsets[row + 1];
+
+            for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx)
+            {
+                OffsetT col             = column_indices[nz_idx];
+
+                samples++;
+                double x                = col;
+                double y                = row;
+
+                double xy =             (x - mean_x) * (y - mean_y);
+                double xxy =            (x - mean_x) * (x - mean_x) * (y - mean_y);
+                double xyy =            (x - mean_x) * (y - mean_y) * (y - mean_y);
+                double delta;
+
+                delta                   = xy - s_xy;
+                s_xy                    = s_xy + (delta / samples);
+
+                delta                   = xxy - s_xxy;
+                s_xxy                   = s_xxy + (delta / samples);
+
+                delta                   = xyy - s_xyy;
+                s_xyy                   = s_xyy + (delta / samples);
+            }
+        }
+
+        double s_xx     = ss_x / num_nonzeros;
+        double s_yy     = ss_y / num_nonzeros;
+        // NOTE: deming_slope is evaluated below but never stored in stats
+        double deming_slope = (s_yy - s_xx + sqrt(((s_yy - s_xx) * (s_yy - s_xx)) + (4 * s_xy * s_xy))) / (2 * s_xy);
+
+        stats.pearson_r = (num_nonzeros * s_xy) / (sqrt(ss_x) * sqrt(ss_y));
+
+
+        //
+        // Compute row-length statistics
+        //
+
+        // Sample mean
+        stats.row_length_mean       = double(num_nonzeros) / num_rows;
+        variance                    = 0.0;
+        stats.row_length_skewness   = 0.0;
+        for (OffsetT row = 0; row < num_rows; ++row)
+        {
+            OffsetT length              = row_offsets[row + 1] - row_offsets[row];
+            double delta                = double(length) - stats.row_length_mean;
+            variance   += (delta * delta);
+            stats.row_length_skewness   += (delta * delta * delta);
+        }
+        variance                    /= num_rows;
+        stats.row_length_std_dev    = sqrt(variance);
+        stats.row_length_skewness   = (stats.row_length_skewness / num_rows) / pow(stats.row_length_std_dev, 3.0);
+        stats.row_length_variation  = stats.row_length_std_dev / stats.row_length_mean;
+
+        return stats;
+    }
+
+    /**
+     * Build CSR matrix from sorted COO matrix (arrays from a previous build are released first)
+     */
+    void FromCoo(const CooMatrix<ValueT, OffsetT> &coo_matrix)
+    {
+        Clear();    // Must run before the sizes below are overwritten: Clear() may read the old sizes
+        num_rows        = coo_matrix.num_rows;
+        num_cols        = coo_matrix.num_cols;
+        num_nonzeros    = coo_matrix.num_nonzeros;
+
+#ifdef CUB_MKL
+        if (numa_malloc)
+        {
+            numa_set_strict(1);
+//            numa_set_bind_policy(1);
+
+//        values          = (ValueT*) numa_alloc_interleaved(sizeof(ValueT) * num_nonzeros);
+//        row_offsets     = (OffsetT*) numa_alloc_interleaved(sizeof(OffsetT) * (num_rows + 1));
+//        column_indices  = (OffsetT*) numa_alloc_interleaved(sizeof(OffsetT) * num_nonzeros);
+
+            row_offsets     = (OffsetT*) numa_alloc_onnode(sizeof(OffsetT) * (num_rows + 1), 0);
+            column_indices  = (OffsetT*) numa_alloc_onnode(sizeof(OffsetT) * num_nonzeros, 0);
+            values          = (ValueT*) numa_alloc_onnode(sizeof(ValueT) * num_nonzeros, 1);
+        }
+        else
+        {
+            values          = (ValueT*) mkl_malloc(sizeof(ValueT) * num_nonzeros, 4096);
+            row_offsets     = (OffsetT*) mkl_malloc(sizeof(OffsetT) * (num_rows + 1), 4096);
+            column_indices  = (OffsetT*) mkl_malloc(sizeof(OffsetT) * num_nonzeros, 4096);
+
+        }
+
+#else
+        row_offsets     = new OffsetT[num_rows + 1];
+        column_indices  = new OffsetT[num_nonzeros];
+        values          = new ValueT[num_nonzeros];
+#endif
+
+        OffsetT prev_row = -1;
+        for (OffsetT current_edge = 0; current_edge < num_nonzeros; current_edge++)
+        {
+            OffsetT current_row = coo_matrix.coo_tuples[current_edge].row;
+
+            // Fill in rows up to and including the current row
+            for (OffsetT row = prev_row + 1; row <= current_row; row++)
+            {
+                row_offsets[row] = current_edge;
+            }
+            prev_row = current_row;
+
+            column_indices[current_edge]    = coo_matrix.coo_tuples[current_edge].col;
+            values[current_edge]            = coo_matrix.coo_tuples[current_edge].val;
+        }
+
+        // Fill out any trailing edgeless vertices (and the end-of-list element)
+        for (OffsetT row = prev_row + 1; row <= num_rows; row++)
+        {
+            row_offsets[row] = num_nonzeros;
+        }
+    }
+
+
+    /**
+     * Display log-histogram of row lengths to stdout
+     */
+    void DisplayHistogram()
+    {
+        // Initialize.  Bucket 0 counts empty rows; bucket d counts rows whose length has d decimal
+        // digits.  21 buckets cover every length representable in 64 bits (at most 20 digits).
+        const int NUM_BUCKETS = 21;
+        int log_counts[NUM_BUCKETS];
+        for (int i = 0; i < NUM_BUCKETS; i++)
+            log_counts[i] = 0;
+
+        // Scan
+        int max_log_length = -1;
+        for (OffsetT row = 0; row < num_rows; row++)
+        {
+            OffsetT length = row_offsets[row + 1] - row_offsets[row];
+
+            int log_length = -1;
+            while (length > 0)
+            {
+                length /= 10;
+                log_length++;
+            }
+            if (log_length > max_log_length)
+            {
+                max_log_length = log_length;
+            }
+
+            log_counts[log_length + 1]++;
+        }
+        printf("CSR matrix (%d rows, %d columns, %d non-zeros):\n", (int) num_rows, (int) num_cols, (int) num_nonzeros);
+        for (int i = -1; i < max_log_length + 1; i++)
+        {
+            printf("\tDegree 1e%d: \t%d (%.2f%%)\n", i, log_counts[i + 1], (float) log_counts[i + 1] * 100.0 / num_rows);
+        }
+        fflush(stdout);
+    }
+
+
+    /**
+     * Display matrix to stdout, one row per line
+     */
+    void Display()
+    {
+        printf("Input Matrix:\n");
+        for (OffsetT row = 0; row < num_rows; row++)
+        {
+            printf("%d [@%d, #%d]: ", (int) row, (int) row_offsets[row], (int) (row_offsets[row + 1] - row_offsets[row]));
+            for (OffsetT current_edge = row_offsets[row]; current_edge < row_offsets[row + 1]; current_edge++)
+            {
+                printf("%d (%f), ", (int) column_indices[current_edge], (double) values[current_edge]);
+            }
+            printf("\n");
+        }
+        fflush(stdout);
+    }
+
+
+};
+
+
+
+/******************************************************************************
+ * Matrix transformations
+ ******************************************************************************/
+
+// Comparator for ordering rows by degree (lowest first), then by row-id (lowest first)
+template <typename OffsetT>
+struct OrderByLow
+{
+    OffsetT* row_degrees;   // Degree lookup indexed by row id; stored only, never freed here
+    OrderByLow(OffsetT* row_degrees) : row_degrees(row_degrees) {}
+
+    // const-qualified: ordered containers and algorithms may invoke the comparator through a const object
+    bool operator()(const OffsetT &a, const OffsetT &b) const
+    {
+        if (row_degrees[a] != row_degrees[b])
+            return (row_degrees[a] < row_degrees[b]);
+
+        // Equal degree: lower row id first
+        return (a < b);
+    }
+};
+
+// Comparator for ordering rows by degree (highest first), then by row-id (lowest first)
+template <typename OffsetT>
+struct OrderByHigh
+{
+    OffsetT* row_degrees;   // Degree lookup indexed by row id; stored only, never freed here
+    OrderByHigh(OffsetT* row_degrees) : row_degrees(row_degrees) {}
+
+    // const-qualified: ordered containers and algorithms may invoke the comparator through a const object
+    bool operator()(const OffsetT &a, const OffsetT &b) const
+    {
+        if (row_degrees[a] != row_degrees[b])
+            return (row_degrees[a] > row_degrees[b]);
+
+        // Equal degree: lower row id first
+        return (a < b);
+    }
+};
+
+
+
+/**
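+ * Cuthill-McKee-style relabeling: fills relabel_indices with a new label per row (the label-reversal step is commented out)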
+ */
+template <typename ValueT, typename OffsetT>
+void RcmRelabel(
+    CsrMatrix<ValueT, OffsetT>&     matrix,
+    OffsetT*                        relabel_indices)
+{
+    // Fills relabel_indices[v] with the visit rank of vertex v in a breadth-first traversal that
+    // seeds each pass with the unlabeled vertex of lowest in-degree and expands neighbors in
+    // order of increasing in-degree.  relabel_indices must have room for matrix.num_rows entries.
+
+    // Initialize row in-degrees (how many nonzeros name each row id as their column)
+    OffsetT* row_degrees_in     = new OffsetT[matrix.num_rows];
+    for (OffsetT row = 0; row < matrix.num_rows; ++row)
+    {
+        row_degrees_in[row]         = 0;
+    }
+    for (OffsetT nonzero = 0; nonzero < matrix.num_nonzeros; ++nonzero)
+    {
+        row_degrees_in[matrix.column_indices[nonzero]]++;
+    }
+
+    // Initialize unlabeled set
+    typedef std::set<OffsetT, OrderByLow<OffsetT> > UnlabeledSet;
+    typename UnlabeledSet::key_compare  unlabeled_comp(row_degrees_in);
+    UnlabeledSet                        unlabeled(unlabeled_comp);
+    for (OffsetT row = 0; row < matrix.num_rows; ++row)
+    {
+        relabel_indices[row]    = -1;
+        unlabeled.insert(row);
+    }
+
+    // Initialize queue set
+    std::deque<OffsetT> q;
+
+    // Process unlabeled vertices (traverse connected components)
+    OffsetT relabel_idx = 0;
+    while (!unlabeled.empty())
+    {
+        // Seed the unvisited frontier queue with the unlabeled vertex of lowest-degree
+        OffsetT vertex = *unlabeled.begin();
+        q.push_back(vertex);
+
+        while (!q.empty())
+        {
+            vertex = q.front();
+            q.pop_front();
+
+            // Several neighbors may queue the same vertex before its turn; only the first visit labels it
+            if (relabel_indices[vertex] == -1)
+            {
+                // Update this vertex
+                unlabeled.erase(vertex);
+                relabel_indices[vertex] = relabel_idx;
+                relabel_idx++;
+
+                // Sort a private copy of the neighbor list by degree.  Sorting matrix.column_indices
+                // in place would separate each column index from its entry in matrix.values.
+                OrderByLow<OffsetT> neighbor_comp(row_degrees_in);
+                std::deque<OffsetT> neighbors(
+                    matrix.column_indices + matrix.row_offsets[vertex],
+                    matrix.column_indices + matrix.row_offsets[vertex + 1]);
+                std::sort(neighbors.begin(), neighbors.end(), neighbor_comp);
+
+                // Inspect neighbors, adding to the out frontier if unlabeled (labeled ones would
+                // be skipped when popped anyway, so filtering them here does not change the labels)
+                for (typename std::deque<OffsetT>::const_iterator it = neighbors.begin();
+                    it != neighbors.end();
+                    ++it)
+                {
+                    if (relabel_indices[*it] == -1)
+                        q.push_back(*it);
+                }
+            }
+        }
+    }
+
+    // The pass that would reverse the labels (the "reverse" in reverse Cuthill-McKee) is disabled:
+    //     for (int row = 0; row < matrix.num_rows; ++row)
+    //         relabel_indices[row] = matrix.num_rows - relabel_indices[row] - 1;
+
+    // Cleanup
+    delete[] row_degrees_in;
+}
+
+
+/**
+ * Cuthill-McKee-style reordering of a square CSR matrix, applied in place (non-square matrices are left unchanged)
+ */
+template <typename ValueT, typename OffsetT>
+void RcmRelabel(
+    CsrMatrix<ValueT, OffsetT>&     matrix,
+    bool                            verbose = false)
+{
+    // Only square matrices are relabeled; anything else is returned untouched
+    if (matrix.num_rows != matrix.num_cols)
+    {
+        if (verbose)
+        {
+            printf("RCM transformation ignored (not square)\n"); fflush(stdout);
+        }
+        return;
+    }
+
+    // Scratch array that receives the new label of every row
+    OffsetT* new_labels = new OffsetT[matrix.num_rows];
+
+    if (verbose)
+    {
+        printf("RCM relabeling... "); fflush(stdout);
+    }
+    RcmRelabel(matrix, new_labels);
+    if (verbose)
+    {
+        printf("done. Reconstituting... "); fflush(stdout);
+    }
+
+    // Stage the relabeled entries as sorted COO tuples, then rebuild the CSR arrays from them
+    CooMatrix<ValueT, OffsetT> relabeled;
+    relabeled.InitCsrRelabel(matrix, new_labels);
+    delete[] new_labels;
+    matrix.Clear();
+    matrix.FromCoo(relabeled);
+
+    if (verbose)
+    {
+        printf("done. "); fflush(stdout);
+    }
+}
+
+
+
+
diff --git a/external/cub/experimental/spmv_compare.cu b/external/cub/experimental/spmv_compare.cu
new file mode 100644
index 00000000000..59e07503b0e
--- /dev/null
+++ b/external/cub/experimental/spmv_compare.cu
@@ -0,0 +1,917 @@
+/******************************************************************************
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+//---------------------------------------------------------------------
+// SpMV comparison tool
+//---------------------------------------------------------------------
+
+#include <stdio.h>
+#include <map>
+#include <vector>
+#include <algorithm>
+#include <cstdio>
+#include <fstream>
+
+#include <cusparse.h>
+
+#include "sparse_matrix.h"
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <cub/device/device_spmv.cuh>
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/tex_ref_input_iterator.cuh>
+#include <test/test_util.h>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants, and type declarations
+//---------------------------------------------------------------------
+
+bool                    g_quiet     = false;        // Whether to display stats in CSV format (set by --quiet)
+bool                    g_verbose   = false;        // Whether to display output to console (set by --v)
+bool                    g_verbose2  = false;        // Whether to display input to console (set by --v2)
+CachingDeviceAllocator  g_allocator(true);          // Caching allocator for device memory
+
+
+//---------------------------------------------------------------------
+// SpMV verification
+//---------------------------------------------------------------------
+
+// Host reference SpMV: vector_y_out = beta * vector_y_in + alpha * (a * vector_x)
+template <
+    typename ValueT,
+    typename OffsetT>
+void SpmvGold(
+    CsrMatrix<ValueT, OffsetT>&     a,
+    ValueT*                         vector_x,
+    ValueT*                         vector_y_in,
+    ValueT*                         vector_y_out,
+    ValueT                          alpha,
+    ValueT                          beta)
+{
+    OffsetT r = 0;
+    while (r < a.num_rows)
+    {
+        // Seed the row sum with the scaled input, then add each scaled product
+        ValueT row_sum = beta * vector_y_in[r];
+        for (OffsetT nz = a.row_offsets[r]; nz < a.row_offsets[r + 1]; ++nz)
+        {
+            row_sum += alpha * a.values[nz] * vector_x[a.column_indices[nz]];
+        }
+        vector_y_out[r] = row_sum;
+        ++r;
+    }
+}
+
+
+//---------------------------------------------------------------------
+// GPU I/O proxy
+//---------------------------------------------------------------------
+
+/**
+ * I/O proxy kernel: read every matrix nonzero value, read every corresponding vector value, then store one value per row
+ */
+template <
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD,
+    typename    ValueT,
+    typename    OffsetT,
+    typename    VectorItr>
+__launch_bounds__ (int(BLOCK_THREADS))
+__global__ void NonZeroIoKernel(
+    SpmvParams<ValueT, OffsetT> params,
+    VectorItr                   d_vector_x)
+{
+    enum
+    {
+        TILE_ITEMS      = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Note: the d_vector_x argument is never referenced below; the vector is read through params.d_vector_x
+    ValueT nonzero = 0.0;
+    // Each thread block handles the tile of TILE_ITEMS consecutive indices selected by blockIdx.x
+    int tile_idx = blockIdx.x;
+
+    OffsetT block_offset = tile_idx * TILE_ITEMS;
+
+    OffsetT column_indices[ITEMS_PER_THREAD];
+    ValueT values[ITEMS_PER_THREAD];
+    // Load this thread's column indices and values; indices at or past num_nonzeros yield zeros
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+        OffsetT nonzero_idx = block_offset + (ITEM * BLOCK_THREADS) + threadIdx.x;
+
+        OffsetT* ci = params.d_column_indices + nonzero_idx;
+        ValueT*a = params.d_values + nonzero_idx;
+        // The pointers are only dereferenced when nonzero_idx is in range
+        column_indices[ITEM]    = (nonzero_idx < params.num_nonzeros) ? *ci : 0;
+        values[ITEM]            = (nonzero_idx < params.num_nonzeros) ? *a : 0.0;
+    }
+
+    __syncthreads();
+
+    // Read vector
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+        ValueT vector_value    = ThreadLoad<LOAD_LDG>(params.d_vector_x + column_indices[ITEM]);
+        nonzero                += vector_value * values[ITEM];
+    }
+
+    __syncthreads();
+    // Blocks whose tile overlaps the row range store the thread's accumulated sum for each in-range row index
+    if (block_offset < params.num_rows)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            OffsetT row_idx = block_offset + (ITEM * BLOCK_THREADS) + threadIdx.x;
+            if (row_idx < params.num_rows)
+            {
+                OffsetT row_end_offset = ThreadLoad<LOAD_DEFAULT>(params.d_row_end_offsets + row_idx);
+                // (nonzero == nonzero) is false only for NaN, so a NaN sum is never stored
+                if ((row_end_offset >= 0) && (nonzero == nonzero))
+                    params.d_vector_y[row_idx] = nonzero;
+            }
+        }
+    }
+
+}
+
+
+/**
+ * Run GPU I/O proxy: launch NonZeroIoKernel once as warmup, then time timing_iterations launches; returns the average milliseconds per launch
+ */
+template <
+    typename ValueT,
+    typename OffsetT>
+float TestGpuCsrIoProxy(
+    SpmvParams<ValueT, OffsetT>&    params,
+    int                             timing_iterations)
+{
+    enum {
+        BLOCK_THREADS       = 128,
+        ITEMS_PER_THREAD    = 7,
+        TILE_SIZE           = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+//    size_t smem = 1024 * 16;
+    size_t smem = 1024 * 0;
+    // Launch enough tiles to cover both the nonzeros and the rows
+    unsigned int nonzero_blocks = (params.num_nonzeros + TILE_SIZE - 1) / TILE_SIZE;
+    unsigned int row_blocks = (params.num_rows + TILE_SIZE - 1) / TILE_SIZE;
+    unsigned int blocks = std::max(nonzero_blocks, row_blocks);
+
+    typedef TexRefInputIterator<ValueT, 1234, int> TexItr;
+    TexItr x_itr;
+    CubDebugExit(x_itr.BindTexture(params.d_vector_x));
+
+    // Get device ordinal
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get device SM version
+    int sm_version;
+    CubDebugExit(SmVersion(sm_version, device_ordinal));
+
+    void (*kernel)(SpmvParams<ValueT, OffsetT>, TexItr) = NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD>;
+
+    // Occupancy is queried for display only
+    int spmv_sm_occupancy;
+    CubDebugExit(MaxSmOccupancy(spmv_sm_occupancy, kernel, BLOCK_THREADS, smem));
+
+    if (!g_quiet)
+        printf("NonZeroIoKernel<%d,%d><<<%d, %d>>>, sm occupancy %d\n", BLOCK_THREADS, ITEMS_PER_THREAD, blocks, BLOCK_THREADS, spmv_sm_occupancy);
+
+    // Warmup
+    NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
+
+    // Check for failures
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(SyncStream(0));
+
+    // Timing
+    GpuTimer timer;
+    float elapsed_millis = 0.0;
+    timer.Start();
+    for (int it = 0; it < timing_iterations; ++it)
+    {
+        NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+    CubDebugExit(cudaPeekAtLastError());    // Surface launch failures from the timed loop instead of reporting a bogus time
+    CubDebugExit(x_itr.UnbindTexture());
+
+    return elapsed_millis / timing_iterations;
+}
+
+
+
+//---------------------------------------------------------------------
+// cuSparse HybMV
+//---------------------------------------------------------------------
+
+/**
+ * Run cuSparse HYB SpMV (specialized for fp32): prints the host-timed CSR-to-HYB setup, checks one warmup result unless quiet, returns average GPU ms per call
+ */
+template <
+    typename OffsetT>
+float TestCusparseHybmv(
+    float*                          vector_y_in,
+    float*                          reference_vector_y_out,
+    SpmvParams<float, OffsetT>&     params,
+    int                             timing_iterations,
+    cusparseHandle_t                cusparse)
+{
+    CpuTimer cpu_timer;
+    cpu_timer.Start();
+
+    // Construct Hyb matrix
+    cusparseMatDescr_t mat_desc;
+    cusparseHybMat_t hyb_desc;
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
+    cusparseStatus_t status = cusparseScsr2hyb(
+        cusparse,
+        params.num_rows, params.num_cols,
+        mat_desc,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        hyb_desc,
+        0,
+        CUSPARSE_HYB_PARTITION_AUTO);
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, status);
+    // Wait for the conversion so the host timer covers it; a failed synchronization is fatal rather than ignored
+    CubDebugExit(cudaDeviceSynchronize());
+    cpu_timer.Stop();
+    float elapsed_millis = cpu_timer.ElapsedMillis();
+    printf("HYB setup ms, %.5f, ", elapsed_millis);
+
+    // Reset input/output vector y
+    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
+
+    // Warmup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
+        cusparse,
+        CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &params.alpha, mat_desc,
+        hyb_desc,
+        params.d_vector_x, &params.beta, params.d_vector_y));
+    // Unless quiet, compare the warmup result on the device against the host reference (nonzero means mismatch)
+    if (!g_quiet)
+    {
+        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
+        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Timing
+    elapsed_millis    = 0.0;
+    GpuTimer timer;
+
+    timer.Start();
+    for(int it = 0; it < timing_iterations; ++it)
+    {
+        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
+            cusparse,
+            CUSPARSE_OPERATION_NON_TRANSPOSE,
+            &params.alpha, mat_desc,
+            hyb_desc,
+            params.d_vector_x, &params.beta, params.d_vector_y));
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+
+    // Cleanup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
+
+    return elapsed_millis / timing_iterations;
+}
+
+
+/**
+ * Run cuSparse HYB SpMV (specialized for fp64): prints the host-timed CSR-to-HYB setup, checks one warmup result unless quiet, returns average GPU ms per call
+ */
+template <
+    typename OffsetT>
+float TestCusparseHybmv(
+    double*                         vector_y_in,
+    double*                         reference_vector_y_out,
+    SpmvParams<double, OffsetT>&    params,
+    int                             timing_iterations,
+    cusparseHandle_t                cusparse)
+{
+    CpuTimer cpu_timer;
+    cpu_timer.Start();
+
+    // Construct Hyb matrix
+    cusparseMatDescr_t mat_desc;
+    cusparseHybMat_t hyb_desc;
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsr2hyb(
+        cusparse,
+        params.num_rows, params.num_cols,
+        mat_desc,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        hyb_desc,
+        0,
+        CUSPARSE_HYB_PARTITION_AUTO));
+    // Wait for the conversion so the host timer covers it; a failed synchronization is fatal rather than ignored
+    CubDebugExit(cudaDeviceSynchronize());
+    cpu_timer.Stop();
+    float elapsed_millis = cpu_timer.ElapsedMillis();
+    printf("HYB setup ms, %.5f, ", elapsed_millis);
+
+    // Reset input/output vector y (elements are double, so size the copy with sizeof(double))
+    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(double) * params.num_rows, cudaMemcpyHostToDevice));
+
+    // Warmup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
+        cusparse,
+        CUSPARSE_OPERATION_NON_TRANSPOSE,
+        &params.alpha, mat_desc,
+        hyb_desc,
+        params.d_vector_x, &params.beta, params.d_vector_y));
+    // Unless quiet, compare the warmup result on the device against the host reference (nonzero means mismatch)
+    if (!g_quiet)
+    {
+        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
+        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Timing
+    elapsed_millis    = 0.0;
+    GpuTimer timer;
+
+    timer.Start();
+    for(int it = 0; it < timing_iterations; ++it)
+    {
+        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
+            cusparse,
+            CUSPARSE_OPERATION_NON_TRANSPOSE,
+            &params.alpha, mat_desc,
+            hyb_desc,
+            params.d_vector_x, &params.beta, params.d_vector_y));
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+
+    // Cleanup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
+
+    return elapsed_millis / timing_iterations;
+}
+
+
+
+//---------------------------------------------------------------------
+// cuSparse CsrMV
+//---------------------------------------------------------------------
+
+/**
+ * Run cuSparse SpMV (specialized for fp32): one checked warmup call, then timing_iterations timed calls; returns average GPU ms per call
+ */
+template <
+    typename OffsetT>
+float TestCusparseCsrmv(
+    float*                          vector_y_in,
+    float*                          reference_vector_y_out,
+    SpmvParams<float, OffsetT>&     params,
+    int                             timing_iterations,
+    cusparseHandle_t                cusparse)
+{
+    cusparseMatDescr_t desc;
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));
+
+    // Reset input/output vector y
+    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
+
+    // Warmup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
+        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        params.d_vector_x, &params.beta, params.d_vector_y));
+    // Unless quiet, compare the warmup result on the device against the host reference (nonzero means mismatch)
+    if (!g_quiet)
+    {
+        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
+        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Timing
+    float elapsed_millis    = 0.0;
+    GpuTimer timer;
+    // Note: d_vector_y is not reset between timed iterations
+    timer.Start();
+    for(int it = 0; it < timing_iterations; ++it)
+    {
+        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
+            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
+            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
+            params.d_values, params.d_row_end_offsets, params.d_column_indices,
+            params.d_vector_x, &params.beta, params.d_vector_y));
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+    // Release the matrix descriptor before reporting the average time per iteration
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
+    return elapsed_millis / timing_iterations;
+}
+
+
+/**
+ * Run cuSparse SpMV (specialized for fp64): one checked warmup call, then timing_iterations timed calls; returns average GPU ms per call
+ */
+template <
+    typename OffsetT>
+float TestCusparseCsrmv(
+    double*                         vector_y_in,
+    double*                         reference_vector_y_out,
+    SpmvParams<double, OffsetT>&    params,
+    int                             timing_iterations,
+    cusparseHandle_t                cusparse)
+{
+    cusparseMatDescr_t desc;
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));
+
+    // Reset input/output vector y (elements are double, so size the copy with sizeof(double))
+    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(double) * params.num_rows, cudaMemcpyHostToDevice));
+
+    // Warmup
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
+        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
+        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        params.d_vector_x, &params.beta, params.d_vector_y));
+    // Unless quiet, compare the warmup result on the device against the host reference (nonzero means mismatch)
+    if (!g_quiet)
+    {
+        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
+        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Timing
+    float elapsed_millis = 0.0;
+    GpuTimer timer;
+    timer.Start();
+    for(int it = 0; it < timing_iterations; ++it)
+    {
+        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
+            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
+            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
+            params.d_values, params.d_row_end_offsets, params.d_column_indices,
+            params.d_vector_x, &params.beta, params.d_vector_y));
+
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+    // Release the matrix descriptor before reporting the average time per iteration
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
+    return elapsed_millis / timing_iterations;
+}
+
+//---------------------------------------------------------------------
+// GPU Merge-based SpMV
+//---------------------------------------------------------------------
+
+/**
+ * Run CUB SpMV: size and allocate temporary storage, run one checked warmup, time timing_iterations calls; returns average GPU ms per call
+ */
+template <
+    typename ValueT,
+    typename OffsetT>
+float TestGpuMergeCsrmv(
+    ValueT*                         vector_y_in,
+    ValueT*                         reference_vector_y_out,
+    SpmvParams<ValueT, OffsetT>&    params,
+    int                             timing_iterations)
+{
+    // Allocate temporary storage
+    size_t temp_storage_bytes = 0;
+    void *d_temp_storage = NULL;
+    // NOTE(review): params.alpha / params.beta are not forwarded (arguments commented out below) - confirm the reference uses matching scalars
+    // Get amount of temporary storage needed
+    CubDebugExit(DeviceSpmv::CsrMV(
+        d_temp_storage, temp_storage_bytes,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        params.d_vector_x, params.d_vector_y,
+        params.num_rows, params.num_cols, params.num_nonzeros,
+// params.alpha, params.beta,
+        (cudaStream_t) 0, false));
+
+    // Allocate
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Reset input/output vector y
+    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(ValueT) * params.num_rows, cudaMemcpyHostToDevice));
+
+    // Warmup
+    CubDebugExit(DeviceSpmv::CsrMV(
+        d_temp_storage, temp_storage_bytes,
+        params.d_values, params.d_row_end_offsets, params.d_column_indices,
+        params.d_vector_x, params.d_vector_y,
+        params.num_rows, params.num_cols, params.num_nonzeros,
+// params.alpha, params.beta,
+        (cudaStream_t) 0, !g_quiet));
+    // Unless quiet, compare the warmup result on the device against the host reference (nonzero means mismatch)
+    if (!g_quiet)
+    {
+        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
+        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Timing
+    GpuTimer timer;
+    float elapsed_millis = 0.0;
+
+    timer.Start();
+    for(int it = 0; it < timing_iterations; ++it)
+    {
+        CubDebugExit(DeviceSpmv::CsrMV(
+            d_temp_storage, temp_storage_bytes,
+            params.d_values, params.d_row_end_offsets, params.d_column_indices,
+            params.d_vector_x, params.d_vector_y,
+            params.num_rows, params.num_cols, params.num_nonzeros,
+// params.alpha, params.beta,
+            (cudaStream_t) 0, false));
+    }
+    timer.Stop();
+    elapsed_millis += timer.ElapsedMillis();
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));    // Release the temporary storage allocated above
+    return elapsed_millis / timing_iterations;
+}
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/**
+ * Display perf: print average time, GFLOPs (two flops per nonzero), and effective bandwidth relative to device_giga_bandwidth
+ */
+template <typename ValueT, typename OffsetT>
+void DisplayPerf(
+    float                           device_giga_bandwidth,
+    double                          avg_millis,
+    CsrMatrix<ValueT, OffsetT>&     csr_matrix)
+{
+    double nz_throughput, effective_bandwidth;
+    size_t total_bytes = (csr_matrix.num_nonzeros * (sizeof(ValueT) * 2 + sizeof(OffsetT))) +
+        (csr_matrix.num_rows) * (sizeof(OffsetT) + sizeof(ValueT));
+    // Dividing by milliseconds and then by 1e6 yields giga-units per second
+    nz_throughput       = double(csr_matrix.num_nonzeros) / avg_millis / 1.0e6;
+    effective_bandwidth = double(total_bytes) / avg_millis / 1.0e6;
+    // Verbose mode prints a labelled line; quiet mode prints comma-separated fields
+    if (!g_quiet)
+        printf("fp%d: %.4f avg ms, %.5f gflops, %.3lf effective GB/s (%.2f%% peak)\n",
+            int(sizeof(ValueT) * 8),    // %d expects int; sizeof yields size_t
+            avg_millis,
+            2 * nz_throughput,
+            effective_bandwidth,
+            effective_bandwidth / device_giga_bandwidth * 100);
+    else
+        printf("%.5f, %.6f, %.3lf, %.2f%%, ",
+            avg_millis,
+            2 * nz_throughput,
+            effective_bandwidth,
+            effective_bandwidth / device_giga_bandwidth * 100);
+
+    fflush(stdout);
+}
+
+
+
+/**
+ * Run tests: convert COO to CSR (optionally RCM-relabeled), compute the host reference, then benchmark the I/O proxy and the SpMV variants selected by --csrmv / --hybmv
+ */
+template <
+    typename ValueT,
+    typename OffsetT>
+void RunTest(
+    bool                        rcm_relabel,
+    ValueT                      alpha,
+    ValueT                      beta,
+    CooMatrix<ValueT, OffsetT>& coo_matrix,
+    int                         timing_iterations,
+    CommandLineArgs&            args)
+{
+    // Adaptive timing iterations: run 16 billion nonzeros through (clamped to [100, 50000]; an empty matrix must not divide by zero)
+    if (timing_iterations == -1)
+        timing_iterations = std::min(50000ull, std::max(100ull, ((16ull << 30) / std::max<unsigned long long>(1, coo_matrix.num_nonzeros))));
+
+    if (!g_quiet)
+        printf("\t%d timing iterations\n", timing_iterations);
+
+    // Convert to CSR
+    CsrMatrix<ValueT, OffsetT> csr_matrix;
+    csr_matrix.FromCoo(coo_matrix);
+    if (!args.CheckCmdLineFlag("csrmv"))
+        coo_matrix.Clear();
+
+    // Relabel
+    if (rcm_relabel)
+    {
+        if (!g_quiet)
+        {
+            csr_matrix.Stats().Display();
+            printf("\n");
+            csr_matrix.DisplayHistogram();
+            printf("\n");
+            if (g_verbose2)
+                csr_matrix.Display();
+            printf("\n");
+        }
+
+        RcmRelabel(csr_matrix, !g_quiet);
+
+        if (!g_quiet) printf("\n");
+    }
+
+    // Display matrix info
+    csr_matrix.Stats().Display(!g_quiet);
+    if (!g_quiet)
+    {
+        printf("\n");
+        csr_matrix.DisplayHistogram();
+        printf("\n");
+        if (g_verbose2)
+            csr_matrix.Display();
+        printf("\n");
+    }
+    fflush(stdout);
+
+    // Allocate input and output vectors
+    ValueT* vector_x        = new ValueT[csr_matrix.num_cols];
+    ValueT* vector_y_in     = new ValueT[csr_matrix.num_rows];
+    ValueT* vector_y_out    = new ValueT[csr_matrix.num_rows];
+    // Both x and the input y are filled with ones
+    for (int col = 0; col < csr_matrix.num_cols; ++col)
+        vector_x[col] = 1.0;
+
+    for (int row = 0; row < csr_matrix.num_rows; ++row)
+        vector_y_in[row] = 1.0;
+
+    // Compute reference answer
+    SpmvGold(csr_matrix, vector_x, vector_y_in, vector_y_out, alpha, beta);
+
+    float avg_millis;
+    // Quiet (CSV) mode starts the record with the device name and precision
+    if (g_quiet) {
+        printf("%s, %s, ", args.deviceProp.name, (sizeof(ValueT) > 4) ? "fp64" : "fp32"); fflush(stdout);
+    }
+
+    // Get GPU device bandwidth (GB/s)
+    float device_giga_bandwidth = args.device_giga_bandwidth;
+
+    // Allocate and initialize GPU problem
+    SpmvParams<ValueT, OffsetT> params;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_values,          sizeof(ValueT) * csr_matrix.num_nonzeros));
+    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_row_end_offsets, sizeof(OffsetT) * (csr_matrix.num_rows + 1)));
+    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_column_indices,  sizeof(OffsetT) * csr_matrix.num_nonzeros));
+    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_vector_x,        sizeof(ValueT) * csr_matrix.num_cols));
+    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_vector_y,        sizeof(ValueT) * csr_matrix.num_rows));
+    params.num_rows         = csr_matrix.num_rows;
+    params.num_cols         = csr_matrix.num_cols;
+    params.num_nonzeros     = csr_matrix.num_nonzeros;
+    params.alpha            = alpha;
+    params.beta             = beta;
+
+    CubDebugExit(cudaMemcpy(params.d_values,            csr_matrix.values,          sizeof(ValueT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(params.d_row_end_offsets,   csr_matrix.row_offsets,     sizeof(OffsetT) * (csr_matrix.num_rows + 1), cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(params.d_column_indices,    csr_matrix.column_indices,  sizeof(OffsetT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(params.d_vector_x,          vector_x,                   sizeof(ValueT) * csr_matrix.num_cols, cudaMemcpyHostToDevice));
+    // The I/O proxy always runs; the SpMV variants below run only when requested on the command line
+    if (!g_quiet) printf("\n\n");
+    printf("GPU CSR I/O Prox, "); fflush(stdout);
+    avg_millis = TestGpuCsrIoProxy(params, timing_iterations);
+    DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
+
+    if (args.CheckCmdLineFlag("csrmv"))
+    {
+        if (!g_quiet) printf("\n\n");
+        printf("CUB, "); fflush(stdout);
+        avg_millis = TestGpuMergeCsrmv(vector_y_in, vector_y_out, params, timing_iterations);
+        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
+    }
+
+    // Initialize cuSparse
+    cusparseHandle_t cusparse;
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreate(&cusparse));
+
+    if (args.CheckCmdLineFlag("csrmv"))
+    {
+        if (!g_quiet) printf("\n\n");
+        printf("Cusparse CsrMV, "); fflush(stdout);
+        avg_millis = TestCusparseCsrmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
+        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
+    }
+
+    if (args.CheckCmdLineFlag("hybmv"))
+    {
+        if (!g_quiet) printf("\n\n");
+        printf("Cusparse HybMV, "); fflush(stdout);
+
+        avg_millis = TestCusparseHybmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
+        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
+    }
+
+    // Cleanup (the cuSparse handle created above is released first)
+    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroy(cusparse));
+    if (params.d_values)            CubDebugExit(g_allocator.DeviceFree(params.d_values));
+    if (params.d_row_end_offsets)   CubDebugExit(g_allocator.DeviceFree(params.d_row_end_offsets));
+    if (params.d_column_indices)    CubDebugExit(g_allocator.DeviceFree(params.d_column_indices));
+    if (params.d_vector_x)          CubDebugExit(g_allocator.DeviceFree(params.d_vector_x));
+    if (params.d_vector_y)          CubDebugExit(g_allocator.DeviceFree(params.d_vector_y));
+
+    if (vector_x)                   delete[] vector_x;
+    if (vector_y_in)                delete[] vector_y_in;
+    if (vector_y_out)               delete[] vector_y_out;
+}
+
+/**
+ * Run tests: build the COO matrix from the first source given (mtx file, grid2d, grid3d, wheel, dense), then hand it to RunTest
+ */
+template <
+    typename ValueT,
+    typename OffsetT>
+void RunTests(
+    bool                rcm_relabel,
+    ValueT              alpha,
+    ValueT              beta,
+    const std::string&  mtx_filename,
+    int                 grid2d,
+    int                 grid3d,
+    int                 wheel,
+    int                 dense,
+    int                 timing_iterations,
+    CommandLineArgs&    args)
+{
+    // Initialize matrix in COO form
+    CooMatrix<ValueT, OffsetT> coo_matrix;
+    // Each branch first prints the dataset label that starts the output record
+    if (!mtx_filename.empty())
+    {
+        // Parse matrix market file
+        printf("%s, ", mtx_filename.c_str()); fflush(stdout);
+        coo_matrix.InitMarket(mtx_filename, 1.0, !g_quiet);
+        // Single-row, single-column, or single-nonzero inputs are skipped with a successful exit
+        if ((coo_matrix.num_rows == 1) || (coo_matrix.num_cols == 1) || (coo_matrix.num_nonzeros == 1))
+        {
+            if (!g_quiet) printf("Trivial dataset\n");
+            exit(0);
+        }
+    }
+    else if (grid2d > 0)
+    {
+        // Generate 2D lattice
+        printf("grid2d_%d, ", grid2d); fflush(stdout);
+        coo_matrix.InitGrid2d(grid2d, false);
+    }
+    else if (grid3d > 0)
+    {
+        // Generate 3D lattice
+        printf("grid3d_%d, ", grid3d); fflush(stdout);
+        coo_matrix.InitGrid3d(grid3d, false);
+    }
+    else if (wheel > 0)
+    {
+        // Generate wheel graph (label with the spoke count, not the unrelated grid2d value)
+        printf("wheel_%d, ", wheel); fflush(stdout);
+        coo_matrix.InitWheel(wheel);
+    }
+    else if (dense > 0)
+    {
+        // Generate dense graph
+        OffsetT size = 1 << 24; // 16M nnz
+        args.GetCmdLineArgument("size", size);
+        // dense is the column count; the row count is derived from the requested nonzero count
+        OffsetT rows = size / dense;
+        printf("dense_%d_x_%d, ", rows, dense); fflush(stdout);
+        coo_matrix.InitDense(rows, dense);
+    }
+    else
+    {
+        fprintf(stderr, "No graph type specified.\n");
+        exit(1);
+    }
+
+    RunTest(
+        rcm_relabel,
+        alpha,
+        beta,
+        coo_matrix,
+        timing_iterations,
+        args);
+}
+
+
+
+/**
+ * Main: parse the command line, initialize the device, and run the tests in fp32 (default) or fp64 (--fp64)
+ */
+int main(int argc, char **argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf(
+            "%s "
+            "[--csrmv] [--hybmv] "
+            "[--device=<device-id>] "
+            "[--quiet] "
+            "[--v] [--v2] "
+            "[--i=<timing iterations>] "
+            "[--fp64] "
+            "[--rcm] "
+            "[--alpha=<alpha scalar (default: 1.0)>] "
+            "[--beta=<beta scalar (default: 0.0)>] "
+            "\n\t"
+                "--mtx=<matrix market file> "
+            "\n\t"
+                "--dense=<cols> [--size=<nonzeros (default: 1 << 24)>]"
+            "\n\t"
+                "--grid2d=<width>"
+            "\n\t"
+                "--grid3d=<width>"
+            "\n\t"
+                "--wheel=<spokes>"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    bool                fp64                = false;
+    bool                rcm_relabel         = false;
+    std::string         mtx_filename;
+    int                 grid2d              = -1;
+    int                 grid3d              = -1;
+    int                 wheel               = -1;
+    int                 dense               = -1;
+    int                 timing_iterations   = -1;
+    float               alpha               = 1.0;
+    float               beta                = 0.0;
+    // Options left at -1 are treated as unset; timing_iterations == -1 selects the adaptive count in RunTest
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_verbose2 = args.CheckCmdLineFlag("v2");
+    g_quiet = args.CheckCmdLineFlag("quiet");
+    fp64 = args.CheckCmdLineFlag("fp64");
+    rcm_relabel = args.CheckCmdLineFlag("rcm");
+    args.GetCmdLineArgument("i", timing_iterations);
+    args.GetCmdLineArgument("mtx", mtx_filename);
+    args.GetCmdLineArgument("grid2d", grid2d);
+    args.GetCmdLineArgument("grid3d", grid3d);
+    args.GetCmdLineArgument("wheel", wheel);
+    args.GetCmdLineArgument("dense", dense);
+    args.GetCmdLineArgument("alpha", alpha);
+    args.GetCmdLineArgument("beta", beta);
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Run test(s)
+    if (fp64)
+    {
+        RunTests<double, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
+    }
+    else
+    {
+        RunTests<float, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
+    }
+
+    CubDebugExit(cudaDeviceSynchronize());
+    printf("\n");
+
+    return 0;
+}
diff --git a/external/cub/experimental/spmv_script.sh b/external/cub/experimental/spmv_script.sh
new file mode 100755
index 00000000000..f43204315a3
--- /dev/null
+++ b/external/cub/experimental/spmv_script.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Usage: spmv_script.sh <spmv-binary> [up to 6 extra args]; prints one "<date>, <binary output>" line per run.
+for i in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 8388608 16777216
+do
+	echo `date`, `$1 --dense=$i $2 $3 $4 $5 $6 $7`
+done
+
+echo
+echo
+# Glob instead of parsing `ls`; skip non-files and any .mtx whose first 50 lines mention "complex" or "array".
+for i in /home/dumerrill/graphs/spmv/*.mtx
+do
+    if [[ -f "$i" && -z "$(head -n 50 "$i" | grep complex)" && -z "$(head -n 50 "$i" | grep array)" ]]
+    then
+        echo `date`, `$1 --mtx="$i" $2 $3 $4 $5 $6 $7 2>/dev/null`
+    fi
+done
+
+echo
+echo
+
+for i in /scratch/dumerrill/graphs/mtx/*.mtx
+#for i in /cygdrive/w/Dev/UFget/mtx/*.mtx
+do
+    if [[ -f "$i" && -z "$(head -n 50 "$i" | grep complex)" && -z "$(head -n 50 "$i" | grep array)" ]]
+    then
+        echo `date`, `$1 --mtx="$i" $2 $3 $4 $5 $6 $7 2>/dev/null`
+    fi
+done
+
diff --git a/external/cub/test/Makefile b/external/cub/test/Makefile
new file mode 100644
index 00000000000..26d253594f2
--- /dev/null
+++ b/external/cub/test/Makefile
@@ -0,0 +1,453 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+
+#-------------------------------------------------------------------------------
+#
+# Makefile usage
+#
+# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>] [quicktest=<0|1>] [quickertest=<0|1>]
+#
+#-------------------------------------------------------------------------------
+
+include ../common.mk 
+ 
+#-------------------------------------------------------------------------------
+# Commandline Options
+#-------------------------------------------------------------------------------
+
+# Testing mode option (quick/thorough)
+ifeq ($(quickertest), 1)
+	NVCCFLAGS += -DQUICKER_TEST
+	TEST_SUFFIX = quicker
+else ifeq ($(quicktest), 1)
+	NVCCFLAGS += -DQUICK_TEST
+	TEST_SUFFIX = quick
+else 
+	TEST_SUFFIX = thorough
+	NPPI = 
+endif
+
+
+# CUDA memcheck (enabled by default) 
+ifeq ($(memcheck), 0)
+	MEMCHECK = 
+else 
+	MEMCHECK = cuda-memcheck
+endif
+
+
+#-------------------------------------------------------------------------------
+# Compiler and compilation platform
+#-------------------------------------------------------------------------------
+
+# Includes
+INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
+
+# Suffix to append to each binary
+SUFFIX = $(BIN_SUFFIX)_$(TEST_SUFFIX)
+
+# Define test arch
+DEFINES += -DTEST_ARCH=$(TEST_ARCH)
+
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+DEPS =				$(CUB_DEPS) \
+					$(CUB_DIR)test/Makefile \
+					$(CUB_DIR)test/test_util.h \
+					$(CUB_DIR)test/mersenne.h \
+
+BLOCK_REDUCE = 		test_block_reduce_raking \
+	 				test_block_reduce_warp_reductions		
+
+
+BLOCK_SCAN = 		test_block_scan_raking \
+	 				test_block_scan_raking_memoize \
+	 				test_block_scan_warp_scans		
+
+
+BLOCK_RADIX_SORT = 	test_block_radix_sort_keys \
+	 				test_block_radix_sort_pairs	
+
+		
+ALL = 				link \
+	 				test_iterator \
+	 				test_allocator \
+	 				test_warp_scan \
+	 				test_warp_reduce \
+	 				$(BLOCK_REDUCE) \
+	 				$(BLOCK_SCAN) \
+	 				$(BLOCK_RADIX_SORT) \
+	 				test_block_load_store \
+	 				test_block_histogram \
+				 	test_device_reduce \
+			 		test_device_histogram \
+			 		test_device_scan \
+			 		test_device_radix_sort \
+					test_device_reduce_by_key\
+					test_device_run_length_encode\
+		 			test_device_select_unique \
+					test_device_select_if 
+		
+#	 	test_grid_barrier \		fails on sm110
+#	 	test_device_seg_reduce
+		
+
+
+#-------------------------------------------------------------------------------
+# make default
+#-------------------------------------------------------------------------------
+
+default:
+
+
+#-------------------------------------------------------------------------------
+# make clean: remove built binaries, link-test objects, and compiler intermediates
+#-------------------------------------------------------------------------------
+
+clean :
+	rm -f bin/*$(CPU_ARCH_SUFFIX)* bin/*.obj
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
+
+
+#-------------------------------------------------------------------------------
+# make all
+#-------------------------------------------------------------------------------
+
+all : $(ALL)
+
+
+#-------------------------------------------------------------------------------
+# make run
+#-------------------------------------------------------------------------------
+
+run : 
+	for i in $(ALL); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
+
+run_block_reduce : 
+	for i in $(BLOCK_REDUCE); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
+
+run_block_scan : 
+	for i in $(BLOCK_SCAN); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
+
+run_block_radix_sort : 
+	for i in $(BLOCK_RADIX_SORT); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
+
+
+
+#-------------------------------------------------------------------------------
+# make link
+#-------------------------------------------------------------------------------
+
+link : bin/link_$(SUFFIX)
+
+bin/link_$(SUFFIX) : link_a.cu link_b.cu link_main.cpp $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(DEFINES) $(SM_TARGETS) link_a.cu -c -o bin/link_a.obj
+	$(NVCC) $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(DEFINES) $(SM_TARGETS) link_b.cu -c -o bin/link_b.obj
+	$(NVCC) $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(DEFINES) $(SM_TARGETS) link_main.cpp bin/link_a.obj bin/link_b.obj -o bin/link_$(SUFFIX)
+
+
+#-------------------------------------------------------------------------------
+# make test_iterator 
+#-------------------------------------------------------------------------------
+
+test_iterator: bin/test_iterator_$(SUFFIX)
+
+bin/test_iterator_$(SUFFIX) : test_iterator.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_iterator_$(SUFFIX) test_iterator.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_allocator 
+#-------------------------------------------------------------------------------
+
+test_allocator: bin/test_allocator_$(SUFFIX)
+
+bin/test_allocator_$(SUFFIX) : test_allocator.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_allocator_$(SUFFIX) test_allocator.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+	
+	
+#-------------------------------------------------------------------------------
+# make test_grid_barrier 
+#-------------------------------------------------------------------------------
+
+test_grid_barrier: bin/test_grid_barrier_$(SUFFIX)
+
+bin/test_grid_barrier_$(SUFFIX) : test_grid_barrier.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_grid_barrier_$(SUFFIX) test_grid_barrier.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+	
+
+#-------------------------------------------------------------------------------
+# make test_warp_scan 
+#-------------------------------------------------------------------------------
+
+test_warp_scan: bin/test_warp_scan_$(SUFFIX)
+
+bin/test_warp_scan_$(SUFFIX) : test_warp_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_warp_scan_$(SUFFIX) test_warp_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_warp_reduce 
+#-------------------------------------------------------------------------------
+
+test_warp_reduce: bin/test_warp_reduce_$(SUFFIX)
+
+bin/test_warp_reduce_$(SUFFIX) : test_warp_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_warp_reduce_$(SUFFIX) test_warp_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_reduce_raking
+#-------------------------------------------------------------------------------
+
+test_block_reduce_raking: bin/test_block_reduce_raking_$(SUFFIX)
+
+bin/test_block_reduce_raking_$(SUFFIX) : test_block_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_RAKING $(SM_TARGETS) -o bin/test_block_reduce_raking_$(SUFFIX) test_block_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_reduce_warp_reductions 
+#-------------------------------------------------------------------------------
+
+test_block_reduce_warp_reductions: bin/test_block_reduce_warp_reductions_$(SUFFIX)
+
+bin/test_block_reduce_warp_reductions_$(SUFFIX) : test_block_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_WARP_REDUCTIONS $(SM_TARGETS) -o bin/test_block_reduce_warp_reductions_$(SUFFIX) test_block_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_reduce 
+#-------------------------------------------------------------------------------
+
+test_block_reduce: $(BLOCK_REDUCE)
+
+
+#-------------------------------------------------------------------------------
+# make test_block_scan_raking
+#-------------------------------------------------------------------------------
+
+test_block_scan_raking: bin/test_block_scan_raking_$(SUFFIX)
+
+bin/test_block_scan_raking_$(SUFFIX) : test_block_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_RAKING $(SM_TARGETS) -o bin/test_block_scan_raking_$(SUFFIX) test_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_scan_raking_memoize
+#-------------------------------------------------------------------------------
+
+test_block_scan_raking_memoize: bin/test_block_scan_raking_memoize_$(SUFFIX)
+
+bin/test_block_scan_raking_memoize_$(SUFFIX) : test_block_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_RAKING_MEMOIZE $(SM_TARGETS) -o bin/test_block_scan_raking_memoize_$(SUFFIX) test_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_scan_warp_scans
+#-------------------------------------------------------------------------------
+
+test_block_scan_warp_scans: bin/test_block_scan_warp_scans_$(SUFFIX)
+
+bin/test_block_scan_warp_scans_$(SUFFIX) : test_block_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_WARP_SCANS $(SM_TARGETS) -o bin/test_block_scan_warp_scans_$(SUFFIX) test_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
+
+
+#-------------------------------------------------------------------------------
+# make test_block_scan 
+#-------------------------------------------------------------------------------
+
+test_block_scan: $(BLOCK_SCAN)
+
+
+#-------------------------------------------------------------------------------
+# make test_block_load_store 
+#-------------------------------------------------------------------------------
+
+test_block_load_store: bin/test_block_load_store_$(SUFFIX)
+
+bin/test_block_load_store_$(SUFFIX) : test_block_load_store.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_block_load_store_$(SUFFIX) test_block_load_store.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+	
+	
+#-------------------------------------------------------------------------------
+# make test_block_radix_sort_keys 
+#-------------------------------------------------------------------------------
+
+test_block_radix_sort_keys: bin/test_block_radix_sort_keys_$(SUFFIX)
+
+bin/test_block_radix_sort_keys_$(SUFFIX) : test_block_radix_sort.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) -DTEST_KEYS_ONLY $(SM_TARGETS) -o bin/test_block_radix_sort_keys_$(SUFFIX) test_block_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make test_block_radix_sort_pairs 
+#-------------------------------------------------------------------------------
+
+test_block_radix_sort_pairs: bin/test_block_radix_sort_pairs_$(SUFFIX)
+
+bin/test_block_radix_sort_pairs_$(SUFFIX) : test_block_radix_sort.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_block_radix_sort_pairs_$(SUFFIX) test_block_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_block_radix_sort
+#-------------------------------------------------------------------------------
+
+test_block_radix_sort : $(BLOCK_RADIX_SORT)
+
+
+#-------------------------------------------------------------------------------
+# make test_block_histogram 
+#-------------------------------------------------------------------------------
+
+test_block_histogram: bin/test_block_histogram_$(SUFFIX)
+
+bin/test_block_histogram_$(SUFFIX) : test_block_histogram.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_block_histogram_$(SUFFIX) test_block_histogram.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_reduce
+#-------------------------------------------------------------------------------
+
+test_device_reduce: bin/test_device_reduce_$(SUFFIX)
+
+bin/test_device_reduce_$(SUFFIX) : test_device_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_reduce_$(SUFFIX) test_device_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_histogram
+#-------------------------------------------------------------------------------
+
+test_device_histogram: bin/test_device_histogram_$(SUFFIX)
+
+bin/test_device_histogram_$(SUFFIX) : test_device_histogram.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_histogram_$(SUFFIX) test_device_histogram.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) $(NPPI) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_scan
+#-------------------------------------------------------------------------------
+
+test_device_scan: bin/test_device_scan_$(SUFFIX)
+
+bin/test_device_scan_$(SUFFIX) : test_device_scan.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_scan_$(SUFFIX) test_device_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_radix_sort
+#-------------------------------------------------------------------------------
+
+test_device_radix_sort: bin/test_device_radix_sort_$(SUFFIX)
+
+bin/test_device_radix_sort_$(SUFFIX) : test_device_radix_sort.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_radix_sort_$(SUFFIX) test_device_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_select_unique
+#-------------------------------------------------------------------------------
+
+test_device_select_unique: bin/test_device_select_unique_$(SUFFIX)
+
+bin/test_device_select_unique_$(SUFFIX) : test_device_select_unique.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_select_unique_$(SUFFIX) test_device_select_unique.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+#-------------------------------------------------------------------------------
+# make test_device_select_if
+#-------------------------------------------------------------------------------
+
+test_device_select_if: bin/test_device_select_if_$(SUFFIX)
+
+bin/test_device_select_if_$(SUFFIX) : test_device_select_if.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_select_if_$(SUFFIX) test_device_select_if.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make test_device_reduce_by_key
+#-------------------------------------------------------------------------------
+
+test_device_reduce_by_key: bin/test_device_reduce_by_key_$(SUFFIX)
+
+bin/test_device_reduce_by_key_$(SUFFIX) : test_device_reduce_by_key.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_reduce_by_key_$(SUFFIX) test_device_reduce_by_key.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+#-------------------------------------------------------------------------------
+# make test_device_run_length_encode
+#-------------------------------------------------------------------------------
+
+test_device_run_length_encode: bin/test_device_run_length_encode_$(SUFFIX)
+
+bin/test_device_run_length_encode_$(SUFFIX) : test_device_run_length_encode.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_run_length_encode_$(SUFFIX) test_device_run_length_encode.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
+
+
+#-------------------------------------------------------------------------------
+# make test_device_seg_reduce
+#-------------------------------------------------------------------------------
+#
+#test_device_seg_reduce: bin/test_device_seg_reduce_$(SUFFIX)
+#
+#bin/test_device_seg_reduce_$(SUFFIX) : test_device_seg_reduce.cu $(DEPS)
+#	mkdir -p bin
+#	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_seg_reduce_$(SUFFIX) test_device_seg_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
+
+
diff --git a/external/cub/test/link_a.cu b/external/cub/test/link_a.cu
new file mode 100644
index 00000000000..8a9b19f93d8
--- /dev/null
+++ b/external/cub/test/link_a.cu
@@ -0,0 +1,11 @@
+#include <cub/cub.cuh>
+
+void a()
+{
+    printf("a() called\n");
+
+    cub::DoubleBuffer<unsigned int>     d_keys;
+    cub::DoubleBuffer<cub::NullType>    d_values;
+    size_t                              temp_storage_bytes = 0;
+    cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
+}
diff --git a/external/cub/test/link_b.cu b/external/cub/test/link_b.cu
new file mode 100644
index 00000000000..a19ec407d90
--- /dev/null
+++ b/external/cub/test/link_b.cu
@@ -0,0 +1,11 @@
+#include <cub/cub.cuh>
+
+void b()
+{
+    printf("b() called\n");
+
+    cub::DoubleBuffer<unsigned int>     d_keys;
+    cub::DoubleBuffer<cub::NullType>    d_values;
+    size_t                              temp_storage_bytes = 0;
+    cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
+}
diff --git a/external/cub/test/link_main.cpp b/external/cub/test/link_main.cpp
new file mode 100644
index 00000000000..ef677ee03b4
--- /dev/null
+++ b/external/cub/test/link_main.cpp
@@ -0,0 +1,10 @@
+#include <stdio.h>
+
+extern void a();
+extern void b();
+
+int main()
+{
+    printf("hello world\n");
+    return 0;
+}
diff --git a/external/cub/test/mersenne.h b/external/cub/test/mersenne.h
new file mode 100644
index 00000000000..76aae809d08
--- /dev/null
+++ b/external/cub/test/mersenne.h
@@ -0,0 +1,160 @@
+/*
+ A C-program for MT19937, with initialization improved 2002/1/26.
+ Coded by Takuji Nishimura and Makoto Matsumoto.
+
+ Before using, initialize the state by using init_genrand(seed)
+ or init_by_array(init_key, key_length).
+
+ Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ 3. The names of its contributors may not be used to endorse or promote
+ products derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+ Any feedback is very welcome.
+ http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
+ email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
+ */
+
+#include <stdio.h>
+
+namespace mersenne {
+
+/* Period parameters */
+const unsigned int N          = 624;
+const unsigned int M          = 397;
+const unsigned int MATRIX_A   = 0x9908b0df; /* constant vector a */
+const unsigned int UPPER_MASK = 0x80000000; /* most significant w-r bits */
+const unsigned int LOWER_MASK = 0x7fffffff; /* least significant r bits */
+
+static unsigned int mt[N];  /* the array for the state vector  */
+static int mti = N + 1;     /* mti==N+1 means mt[N] is not initialized */
+
+/* Seeds the state vector mt[] from s; on return mti == N, so the next genrand_int32() call regenerates the state. */
+void init_genrand(unsigned int s)
+{
+    mt[0] = s & 0xffffffff;
+    for (mti = 1; mti < N; mti++)
+    {
+        mt[mti] = (1812433253 * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti);
+
+        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
+        /* In the previous versions, MSBs of the seed affect   */
+        /* only MSBs of the array mt[].                        */
+        /* 2002/01/09 modified by Makoto Matsumoto             */
+
+        mt[mti] &= 0xffffffff;
+        /* the mask above is for >32 bit machines (keeps each state word to 32 bits) */
+    }
+}
+
+/* Seeds the state vector mt[] from an array of key words (runs init_genrand(19650218) first, then mixes the keys in). */
+/* init_key is the array of initializing keys */
+/* key_length is the number of elements in init_key */
+/* slight change for C++, 2004/2/26 */
+void init_by_array(unsigned int init_key[], int key_length)
+{
+    int i, j, k;
+    init_genrand(19650218);
+    i = 1;
+    j = 0;
+    k = (N > key_length ? N : key_length);
+    for (; k; k--)
+    {
+        mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1664525))
+            + init_key[j] + j;  /* non linear */
+        mt[i] &= 0xffffffff;    /* for WORDSIZE > 32 machines */
+        i++;
+        j++;
+        if (i >= N)
+        {
+            mt[0] = mt[N - 1];
+            i = 1;
+        }
+        if (j >= key_length) j = 0;
+    }
+    for (k = N - 1; k; k--)
+    {
+        mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1566083941)) - i; /* non linear */
+        mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */
+        i++;
+        if (i >= N)
+        {
+            mt[0] = mt[N - 1];
+            i = 1;
+        }
+    }
+
+    mt[0] = 0x80000000; /* MSB is 1; assuring non-zero initial array */
+}
+
+/* generates a random number on [0,0xffffffff]-interval */
+unsigned int genrand_int32(void)
+{
+    unsigned int y;
+    static unsigned int mag01[2] = { 0x0, MATRIX_A };
+
+    /* mag01[x] = x * MATRIX_A  for x=0,1 */
+
+    if (mti >= N)
+    { /* state exhausted (or never seeded): generate N words at one time */
+        int kk;
+
+        if (mti == N + 1) /* if init_genrand() has not been called, */
+        init_genrand(5489); /* a default initial seed is used */
+
+        for (kk = 0; kk < N - M; kk++)
+        {
+            y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
+            mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & 0x1];
+        }
+        for (; kk < N - 1; kk++)
+        {
+            y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
+            mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & 0x1];
+        }
+        y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
+        mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & 0x1];
+
+        mti = 0;
+    }
+
+    y = mt[mti++];
+
+    /* Tempering */
+    y ^= (y >> 11);
+    y ^= (y << 7) & 0x9d2c5680;
+    y ^= (y << 15) & 0xefc60000;
+    y ^= (y >> 18);
+
+    return y;
+}
+
+
+
+} // namespace mersenne
diff --git a/external/cub/test/test_allocator.cu b/external/cub/test/test_allocator.cu
new file mode 100644
index 00000000000..8176db68be0
--- /dev/null
+++ b/external/cub/test/test_allocator.cu
@@ -0,0 +1,459 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test evaluation for caching allocator of device memory
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/util_allocator.cuh>
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: checks CachingDeviceAllocator caching/reuse behavior (single- and multi-GPU), then times it against cudaMalloc/cudaFree
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>]"
+            "[--bytes=<timing bytes>]"
+            "[--i=<timing iterations>]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+#if (CUB_PTX_ARCH == 0)
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get number of GPUs and current GPU; timing defaults may be overridden with --i and --bytes
+    int num_gpus;
+    int initial_gpu;
+    int timing_iterations           = 10000;
+    int timing_bytes                = 1024 * 1024;
+
+    if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1);
+    if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1);
+    args.GetCmdLineArgument("i", timing_iterations);
+    args.GetCmdLineArgument("bytes", timing_bytes);
+
+    // Create default allocator (caches up to 6MB in device allocations per GPU)
+    CachingDeviceAllocator allocator;
+    allocator.debug = true;
+
+    printf("Running single-gpu tests...\n"); fflush(stdout);
+
+    //
+    // Test0
+    //
+
+    // Create a new stream
+    cudaStream_t other_stream;
+    CubDebugExit(cudaStreamCreate(&other_stream));
+
+    // Allocate 999 bytes on the current gpu in stream0
+    char *d_999B_stream0_a;
+    char *d_999B_stream0_b;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
+
+    // Run some big kernel in stream 0 (and make sure the launch itself was accepted)
+    EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>(); CubDebugExit(cudaPeekAtLastError());
+
+    // Free d_999B_stream0_a
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
+
+    // Allocate another 999 bytes in stream 0
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
+
+    // Check that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Run some big kernel in stream 0 (and make sure the launch itself was accepted)
+    EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>(); CubDebugExit(cudaPeekAtLastError());
+
+    // Free d_999B_stream0_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
+
+    // Allocate 999 bytes on the current gpu in other_stream
+    char *d_999B_stream_other_a;
+    char *d_999B_stream_other_b;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));
+
+    // Check that we have 1 live block on the initial GPU (a new block was allocated, because d_999B_stream0_b is only available to stream 0 until it becomes idle)
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that we have one cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    // Run some big kernel in other_stream (and make sure the launch itself was accepted)
+    EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>(); CubDebugExit(cudaPeekAtLastError());
+
+    // Free d_999B_stream_other_a
+    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
+
+    // Check that we can now use both allocations in stream 0 after synchronizing the device
+    CubDebugExit(cudaDeviceSynchronize());
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
+
+    // Check that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    // Check that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Free d_999B_stream0_a and d_999B_stream0_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
+
+    // Check that we can now use both allocations in other_stream
+    CubDebugExit(cudaDeviceSynchronize());
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_b, 999, other_stream));
+
+    // Check that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    // Check that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Run some big kernel in other_stream (and make sure the launch itself was accepted)
+    EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>(); CubDebugExit(cudaPeekAtLastError());
+
+    // Free d_999B_stream_other_a and d_999B_stream_other_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
+    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_b));
+
+    // Check that we can now use both allocations in stream 0 after synchronizing the device and destroying the other stream
+    CubDebugExit(cudaDeviceSynchronize());
+    CubDebugExit(cudaStreamDestroy(other_stream));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
+
+    // Check that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    // Check that we have no cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Free d_999B_stream0_a and d_999B_stream0_b
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
+    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
+
+    // Free all cached
+    CubDebugExit(allocator.FreeAllCached());
+
+    //
+    // Test1
+    //
+
+    // Allocate 5 bytes on the current gpu
+    char *d_5B;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_5B, 5));
+
+    // Check that we have zero free bytes cached on the initial GPU
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
+
+    // Check that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    //
+    // Test2
+    //
+
+    // Allocate 4096 bytes on the current gpu
+    char *d_4096B;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_4096B, 4096));
+
+    // Check that we have 2 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 2);
+
+    //
+    // Test3
+    //
+
+    // DeviceFree d_5B
+    CubDebugExit(allocator.DeviceFree(d_5B));
+
+    // Check that we have min_bin_bytes free bytes cached on the initial gpu
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
+
+    // Check that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that we have 1 cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    //
+    // Test4
+    //
+
+    // DeviceFree d_4096B
+    CubDebugExit(allocator.DeviceFree(d_4096B));
+
+    // Check that we have min_bin_bytes + 4096 free bytes cached on the initial gpu
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes + 4096);
+
+    // Check that we have 0 live blocks on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 0);
+
+    // Check that we have 2 cached blocks on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 2);
+
+    //
+    // Test5
+    //
+
+    // Allocate 768 bytes on the current gpu
+    char *d_768B;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_768B, 768));
+
+    // Check that we have min_bin_bytes free bytes cached on the initial gpu (4096 was reused)
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
+
+    // Check that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that we have 1 cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    //
+    // Test6
+    //
+
+    // Allocate max_cached_bytes on the current gpu
+    char *d_max_cached;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached, allocator.max_cached_bytes));
+
+    // DeviceFree d_max_cached
+    CubDebugExit(allocator.DeviceFree(d_max_cached));
+
+    // Check that we have min_bin_bytes free bytes cached on the initial gpu (max cached was not returned because we went over)
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
+
+    // Check that we have 1 live block on the initial GPU
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    // Check that we still have 1 cached block on the initial GPU
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    //
+    // Test7
+    //
+
+    // Free all cached blocks on all GPUs
+    CubDebugExit(allocator.FreeAllCached());
+
+    // Check that we have 0 bytes cached on the initial GPU
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
+
+    // Check that we have 0 cached blocks across all GPUs
+    AssertEquals(allocator.cached_blocks.size(), 0);
+
+    // Check that we still have 1 live block across all GPUs
+    AssertEquals(allocator.live_blocks.size(), 1);
+
+    //
+    // Test8
+    //
+
+    // Allocate max cached bytes + 1 on the current gpu
+    char *d_max_cached_plus;
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached_plus, allocator.max_cached_bytes + 1));
+
+    // DeviceFree d_max_cached_plus
+    CubDebugExit(allocator.DeviceFree(d_max_cached_plus));
+
+    // DeviceFree d_768B
+    CubDebugExit(allocator.DeviceFree(d_768B));
+
+    unsigned int power;
+    size_t rounded_bytes;
+    allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768);
+
+    // Check that we have rounded_bytes (the value NearestPowerOf yields for 768) free bytes cached on the initial gpu
+    AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
+
+    // Check that we have 1 cached block across all GPUs
+    AssertEquals(allocator.cached_blocks.size(), 1);
+
+    // Check that we have 0 live blocks across all GPUs
+    AssertEquals(allocator.live_blocks.size(), 0);
+
+#ifndef CUB_CDP
+    // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not
+
+    if (num_gpus > 1)
+    {
+        printf("\nRunning multi-gpu tests...\n"); fflush(stdout);
+
+        //
+        // Test9
+        //
+
+        // Allocate 768 bytes on the next gpu
+        int next_gpu = (initial_gpu + 1) % num_gpus;
+        char *d_768B_2;
+        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
+
+        // DeviceFree d_768B_2 on the next gpu
+        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
+
+        // Re-allocate 768 bytes on the next gpu
+        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
+
+        // Re-free d_768B_2 on the next gpu
+        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
+
+        // Check that we have rounded_bytes free bytes cached on the initial gpu
+        AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
+
+        // Check that we have rounded_bytes free bytes cached on the second gpu
+        AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes);
+
+        // Check that we have 2 cached blocks across all GPUs
+        AssertEquals(allocator.cached_blocks.size(), 2);
+
+        // Check that we still have 0 live blocks across all GPUs
+        AssertEquals(allocator.live_blocks.size(), 0);
+    }
+#endif  // CUB_CDP
+
+    //
+    // Performance
+    //
+
+    printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
+    fflush(stdout); fflush(stderr);
+
+    // CPU performance comparisons vs cached.  Allocate and free a timing_bytes block timing_iterations times
+    CpuTimer    cpu_timer;
+    char        *d_1024MB                       = NULL;
+    allocator.debug                             = false;
+
+    // Prime the caching allocator and the kernel
+    CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
+    CubDebugExit(allocator.DeviceFree(d_1024MB));
+    cub::EmptyKernel<void><<<1, 32>>>(); CubDebugExit(cudaPeekAtLastError());
+
+    // CUDA
+    cpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
+        CubDebugExit(cudaFree(d_1024MB));
+    }
+    cpu_timer.Stop();
+    float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();
+
+    // CUB
+    cpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
+        CubDebugExit(allocator.DeviceFree(d_1024MB));
+    }
+    cpu_timer.Stop();
+    float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();
+
+    printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
+        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
+        cuda_malloc_elapsed_millis / timing_iterations,
+        cub_calloc_elapsed_millis / timing_iterations);
+
+    // GPU performance comparisons.  Allocate and free a timing_bytes block timing_iterations times
+    GpuTimer gpu_timer;
+
+    printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
+    fflush(stdout); fflush(stderr);
+
+    // Kernel-only (baseline that is subtracted from the two measurements below)
+    gpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        cub::EmptyKernel<void><<<1, 32>>>();
+    }
+    gpu_timer.Stop();
+    float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // CUDA
+    gpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
+        cub::EmptyKernel<void><<<1, 32>>>();
+        CubDebugExit(cudaFree(d_1024MB));
+    }
+    gpu_timer.Stop();
+    cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
+
+    // CUB
+    gpu_timer.Start();
+    for (int i = 0; i < timing_iterations; ++i)
+    {
+        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
+        cub::EmptyKernel<void><<<1, 32>>>();
+        CubDebugExit(allocator.DeviceFree(d_1024MB));
+    }
+    gpu_timer.Stop();
+    cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
+
+    printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
+        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
+        cuda_malloc_elapsed_millis / timing_iterations,
+        cub_calloc_elapsed_millis / timing_iterations);
+
+
+#endif  // CUB_PTX_ARCH == 0
+
+    printf("Success\n");
+
+    return 0;
+}
+
diff --git a/external/cub/test/test_block_histogram.cu b/external/cub/test/test_block_histogram.cu
new file mode 100644
index 00000000000..1b61341d984
--- /dev/null
+++ b/external/cub/test/test_block_histogram.cu
@@ -0,0 +1,310 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockHistogram utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <limits>
+#include <string>
+#include <typeinfo>
+
+#include <cub/block/block_histogram.cuh>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false;
+int                     g_timing_iterations = 0;
+int                     g_repeat            = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * Test kernel: every thread loads ITEMS_PER_THREAD samples in striped order and the block histograms them into d_histogram.
+ */
+template <
+    int                     BINS,
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    BlockHistogramAlgorithm ALGORITHM,
+    typename                T,
+    typename                HistoCounter>
+__global__ void BlockHistogramKernel(
+    T                       *d_samples,
+    HistoCounter            *d_histogram)
+{
+    // BlockHistogram specialization matching this kernel's template configuration
+    typedef BlockHistogram<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS, ALGORITHM> BlockHistogramT;
+
+    // Temporary storage required by the collective, declared in shared memory
+    __shared__ typename BlockHistogramT::TempStorage histo_storage;
+
+    // Each thread fetches its own striped slice of the input samples
+    T thread_samples[ITEMS_PER_THREAD];
+    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_samples, thread_samples);
+
+    // Histogram the block's samples straight into the caller-provided output buffer
+    BlockHistogramT(histo_storage).Histogram(thread_samples, d_histogram);
+}
+
+
+/**
+ * Generate num_samples host samples (each reduced modulo BINS) and tally them
+ * into the BINS-entry host reference histogram h_histograms_linear.
+ */
+template <
+    int             BINS,
+    typename        SampleT>
+void Initialize(
+    GenMode         gen_mode,
+    SampleT         *h_samples,
+    int             *h_histograms_linear,
+    int             num_samples)
+{
+    // Start from an all-zero reference histogram
+    for (int bin = BINS; bin-- > 0; )
+        h_histograms_linear[bin] = 0;
+
+    if (g_verbose)
+        printf("Samples: \n");
+
+    // Produce each sample, reduce it modulo BINS and count it in its bin
+    for (int idx = 0; idx < num_samples; ++idx)
+    {
+        SampleT &sample = h_samples[idx];
+        InitValue(gen_mode, sample, idx);
+        sample %= BINS;
+
+        if (g_verbose) std::cout << CoutCast(sample) << ", ";
+        ++h_histograms_linear[sample];
+    }
+
+    if (g_verbose) printf("\n\n");
+}
+
+
+/**
+ * Run one BlockHistogram configuration with a single thread block and compare
+ * the device histogram against a host-computed reference.
+ */
+template <
+    typename                    SampleT,
+    int                         BINS,
+    int                         BLOCK_THREADS,
+    int                         ITEMS_PER_THREAD,
+    BlockHistogramAlgorithm     ALGORITHM>
+void Test(
+    GenMode                     gen_mode)
+{
+    const int    num_samples  = BLOCK_THREADS * ITEMS_PER_THREAD;
+    const size_t sample_bytes = sizeof(SampleT) * num_samples;
+    const size_t histo_bytes  = sizeof(int) * BINS;
+
+    // Labels used in the banner line
+    const char *algorithm_name = (ALGORITHM == BLOCK_HISTO_SORT) ? "BLOCK_HISTO_SORT" : "BLOCK_HISTO_ATOMIC";
+    const char *gen_mode_name  = "HOMOGENOUS";
+    if (gen_mode == RANDOM)            gen_mode_name = "RANDOM";
+    else if (gen_mode == INTEGER_SEED) gen_mode_name = "SEQUENTIAL";
+
+    printf("cub::BlockHistogram %s %d %s samples (%dB), %d bins, %d threads, gen-mode %s\n",
+        algorithm_name, num_samples, typeid(SampleT).name(), (int) sizeof(SampleT), BINS, BLOCK_THREADS, gen_mode_name);
+    fflush(stdout);
+
+    // Host-side input samples and expected histogram
+    SampleT *h_samples   = new SampleT[num_samples];
+    int     *h_reference = new int[BINS];
+    Initialize<BINS>(gen_mode, h_samples, h_reference, num_samples);
+
+    // Device-side input samples and output histogram
+    SampleT *d_samples   = NULL;
+    int     *d_histogram = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sample_bytes));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram, histo_bytes));
+
+    // Upload the samples and zero the output bins
+    CubDebugExit(cudaMemcpy(d_samples, h_samples, sample_bytes, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_histogram, 0, histo_bytes));
+
+    // Launch exactly one block of BLOCK_THREADS threads
+    BlockHistogramKernel<BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<1, BLOCK_THREADS>>>(
+        d_samples, d_histogram);
+
+    // Compare with the reference (nonzero is reported as FAIL); display data when verbose
+    const int mismatch = CompareDeviceResults((int*) h_reference, d_histogram, BINS, g_verbose, g_verbose);
+    printf("\t%s\n\n", mismatch ? "FAIL" : "PASS");
+
+    // Surface any pending CUDA error, then flush console output
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+    fflush(stdout);
+    fflush(stderr);
+
+    // Release host and device buffers
+    delete[] h_samples;
+    delete[] h_reference;
+    if (d_samples)   CubDebugExit(g_allocator.DeviceFree(d_samples));
+    if (d_histogram) CubDebugExit(g_allocator.DeviceFree(d_histogram));
+
+    // Abort the run if the histograms differed
+    AssertEquals(0, mismatch);
+}
+
+
+/**
+ * Run the given configuration once per generation mode: UNIFORM, INTEGER_SEED, then RANDOM
+ */
+template <
+    typename                    SampleT,
+    int                         BINS,
+    int                         BLOCK_THREADS,
+    int                         ITEMS_PER_THREAD,
+    BlockHistogramAlgorithm     ALGORITHM>
+void Test()
+{
+    const GenMode gen_modes[] = { UNIFORM, INTEGER_SEED, RANDOM };
+    for (int m = 0; m < 3; ++m)
+        Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>(gen_modes[m]);
+}
+
+
+/**
+ * Run the given configuration with each ALGORITHM: BLOCK_HISTO_SORT, then BLOCK_HISTO_ATOMIC
+ */
+template <
+    typename                    SampleT,
+    int                         BINS,
+    int                         BLOCK_THREADS,
+    int                         ITEMS_PER_THREAD>
+void Test()
+{
+    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_HISTO_SORT>();
+    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_HISTO_ATOMIC>();
+}
+
+
+/**
+ * Run the given configuration with different ITEMS_PER_THREAD: 1, then 5
+ */
+template <
+    typename                    SampleT,
+    int                         BINS,
+    int                         BLOCK_THREADS>
+void Test()
+{
+    Test<SampleT, BINS, BLOCK_THREADS, 1>();
+    Test<SampleT, BINS, BLOCK_THREADS, 5>();
+}
+
+
+/**
+ * Run the given sample type and bin count with different BLOCK_THREADS: 32, 96, then 128
+ */
+template <
+    typename                    SampleT,
+    int                         BINS>
+void Test()
+{
+    Test<SampleT, BINS, 32>();
+    Test<SampleT, BINS, 96>();
+    Test<SampleT, BINS, 128>();
+}
+
+
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: reads --v / --repeat / --help, initializes the device, then runs the quick or the thorough BlockHistogram test set
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v]"
+            "\n",
+            argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#ifdef QUICK_TEST
+
+    // Compile/run quick tests
+    Test<unsigned char, 256, 128, 4, BLOCK_HISTO_SORT>(RANDOM);
+    Test<unsigned char, 256, 128, 4, BLOCK_HISTO_ATOMIC>(RANDOM);
+
+#else
+
+    // Compile/run thorough tests: one pass plus g_repeat repetitions (the bound is <=)
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        Test<unsigned char, 32>();
+        Test<unsigned char, 256>();
+        Test<unsigned short, 1024>();
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/test/test_block_load_store.cu b/external/cub/test/test_block_load_store.cu
new file mode 100644
index 00000000000..ca6ef1310f7
--- /dev/null
+++ b/external/cub/test/test_block_load_store.cu
@@ -0,0 +1,549 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockLoad and BlockStore utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iterator>
+#include <stdio.h>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/iterator/cache_modified_input_iterator.cuh>
+#include <cub/iterator/cache_modified_output_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;
+CachingDeviceAllocator  g_allocator(true);
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+
+/**
+ * Load/store round-trip kernel.
+ *
+ * Each thread block handles the tile of BLOCK_THREADS * ITEMS_PER_THREAD items
+ * that starts at blockIdx.x * TILE_SIZE.  Pass 1 copies the tile from d_in to
+ * d_out_unguarded using the full-tile Load/Store overloads.  The per-thread
+ * items are then reset, and pass 2 copies the tile to d_out_guarded using the
+ * overloads that take a valid-item count (num_items minus the tile offset).
+ *
+ * Barriers separate every load and store because both primitives alias the
+ * same shared-memory union.
+ */
+template <
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM,
+    typename            InputIteratorT,
+    typename            OutputIteratorT>
+__launch_bounds__ (BLOCK_THREADS, 1)
+__global__ void Kernel(
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out_unguarded,
+    OutputIteratorT     d_out_guarded,
+    int                 num_items)
+{
+    enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
+
+    // Value types seen through the two iterators
+    typedef typename std::iterator_traits<InputIteratorT>::value_type   InputT;
+    typedef typename std::iterator_traits<OutputIteratorT>::value_type  RawOutputT;
+
+    // Items are staged as OutputT: the output iterator's value type, unless
+    // that is void, in which case the input value type is used instead
+    typedef typename If<(Equals<RawOutputT, void>::VALUE), InputT, RawOutputT>::Type OutputT;
+
+    // Block-wide load/store primitives
+    typedef BlockLoad<InputT, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM>      BlockLoadT;
+    typedef BlockStore<OutputT, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM>   BlockStoreT;
+
+    // One shared-memory allocation, reused by the load and the store
+    union SharedStorage
+    {
+        typename BlockLoadT::TempStorage    load;
+        typename BlockStoreT::TempStorage   store;
+    };
+    __shared__ SharedStorage smem;
+
+    // This block's tile offset and the item count remaining from there
+    const int tile_base   = blockIdx.x * TILE_SIZE;
+    const int valid_items = num_items - tile_base;
+
+    // Per-thread items
+    OutputT items[ITEMS_PER_THREAD];
+
+    // Pass 1: full-tile load followed by full-tile store
+    BlockLoadT(smem.load).Load(d_in + tile_base, items);
+    __syncthreads();
+    BlockStoreT(smem.store).Store(d_out_unguarded + tile_base, items);
+    __syncthreads();
+
+    // Reset the items to value-initialized OutputT between the passes
+    #pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+    {
+        items[i] = OutputT();
+    }
+
+    __syncthreads();
+
+    // Pass 2: the same round trip, with the load and the store both
+    // limited to valid_items
+    BlockLoadT(smem.load).Load(d_in + tile_base, items, valid_items);
+    __syncthreads();
+    BlockStoreT(smem.store).Store(d_out_guarded + tile_base, items, valid_items);
+}
+
+
+//---------------------------------------------------------------------
+// Host testing subroutines
+//---------------------------------------------------------------------
+
+
+/**
+ * Launch the load/store kernel and validate both outputs against h_in.
+ *
+ * The kernel is launched twice: once writing to a DiscardOutputIterator, and
+ * once writing through the supplied output iterators.  After synchronizing,
+ * the guarded output is compared over guarded_elements items and the
+ * unguarded output over every item in the grid.
+ *
+ * d_out_*_ptr should address the storage that the d_out_*_itr iterators
+ * write to; the raw pointers are what the comparison reads.
+ */
+template <
+    typename            T,
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM,
+    typename            InputIteratorT,
+    typename            OutputIteratorT>
+void TestKernel(
+    T                   *h_in,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out_unguarded_itr,
+    OutputIteratorT     d_out_guarded_itr,
+    T                   *d_out_unguarded_ptr,
+    T                   *d_out_guarded_ptr,
+    int                 grid_size,
+    int                 guarded_elements)
+{
+    // Difference type of the input iterator, used as the discard iterator's offset type
+    typedef typename std::iterator_traits<InputIteratorT>::difference_type OffsetT;
+
+    const int total_items = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Launch 1: outputs are discarded
+    DiscardOutputIterator<OffsetT> discard_out;
+    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
+        <<<grid_size, BLOCK_THREADS>>>(d_in, discard_out, discard_out, guarded_elements);
+
+    // Launch 2: outputs go through the caller's iterators
+    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
+        <<<grid_size, BLOCK_THREADS>>>(d_in, d_out_unguarded_itr, d_out_guarded_itr, guarded_elements);
+
+    // Surface launch errors, then wait for both kernels
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Guarded output
+    const int guarded_mismatch = CompareDeviceResults(h_in, d_out_guarded_ptr, guarded_elements, g_verbose, g_verbose);
+    printf("\tGuarded: %s\n", guarded_mismatch ? "FAIL" : "PASS");
+    AssertEquals(0, guarded_mismatch);
+
+    // Unguarded output
+    const int unguarded_mismatch = CompareDeviceResults(h_in, d_out_unguarded_ptr, total_items, g_verbose, g_verbose);
+    printf("\tUnguarded: %s\n", unguarded_mismatch ? "FAIL" : "PASS");
+    AssertEquals(0, unguarded_mismatch);
+}
+
+
+/**
+ * Test native pointer.  Overload selected when resources are sufficient.
+ * Uploads grid_size * BLOCK_THREADS * ITEMS_PER_THREAD generated items and
+ * runs TestKernel on raw device pointers, reading the input as T const *.
+ */
+template <
+    typename            T,
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM>
+void TestNative(
+    int                 grid_size,
+    float               fraction_valid,
+    Int2Type<true>      sufficient_resources)
+{
+    const int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
+    const int guarded_elements   = int(fraction_valid * float(unguarded_elements));
+
+    // Buffer sizes in bytes
+    const size_t unguarded_bytes = sizeof(T) * unguarded_elements;
+    const size_t guarded_bytes   = sizeof(T) * guarded_elements;
+
+    // Host input
+    T *h_in = (T*) malloc(unguarded_bytes);
+
+    // Device input and the two zero-filled outputs
+    T *d_in             = NULL;
+    T *d_out_unguarded  = NULL;
+    T *d_out_guarded    = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, unguarded_bytes));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, unguarded_bytes));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, guarded_bytes));
+    CubDebugExit(cudaMemset(d_out_unguarded, 0, unguarded_bytes));
+    CubDebugExit(cudaMemset(d_out_guarded, 0, guarded_bytes));
+
+    // Fill the host input, then upload it
+    for (int item = 0; item < unguarded_elements; ++item)
+        InitValue(INTEGER_SEED, h_in[item], item);
+    CubDebugExit(cudaMemcpy(d_in, h_in, unguarded_bytes, cudaMemcpyHostToDevice));
+
+    printf("TestNative grid_size(%d) guarded_elements(%d) unguarded_elements(%d) "
+        "BLOCK_THREADS(%d) ITEMS_PER_THREAD(%d) LOAD_ALGORITHM(%d) STORE_ALGORITHM(%d) "
+        "sizeof(T)(%d)\n",
+        grid_size, guarded_elements, unguarded_elements,
+        BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM,
+        (int) sizeof(T));
+
+    TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
+        h_in,
+        (T const *) d_in,       // input is read through a pointer-to-const
+        d_out_unguarded,        // unguarded output, as iterator
+        d_out_guarded,          // guarded output, as iterator
+        d_out_unguarded,        // unguarded output, as raw pointer for checking
+        d_out_guarded,          // guarded output, as raw pointer for checking
+        grid_size,
+        guarded_elements);
+
+    // Cleanup
+    free(h_in);                 // free(NULL) is a no-op
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
+    if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
+}
+
+
+/**
+ * Test native pointer.  Overload selected when resources are insufficient:
+ * the configuration is skipped and nothing is launched.
+ */
+template <
+    typename            T,
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM>
+void TestNative(int grid_size, float fraction_valid, Int2Type<false> sufficient_resources)
+{
+    // Intentionally empty
+}
+
+
+/**
+ * Test iterator.  Overload selected when resources are sufficient.
+ * Uploads grid_size * BLOCK_THREADS * ITEMS_PER_THREAD generated items and
+ * runs TestKernel through cache-modified input and output iterators.
+ */
+template <
+    typename            T,
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM,
+    CacheLoadModifier   LOAD_MODIFIER,
+    CacheStoreModifier  STORE_MODIFIER>
+void TestIterator(
+    int                 grid_size,
+    float               fraction_valid,
+    Int2Type<true>      sufficient_resources)
+{
+    const int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
+    const int guarded_elements   = int(fraction_valid * float(unguarded_elements));
+
+    // Buffer sizes in bytes
+    const size_t unguarded_bytes = sizeof(T) * unguarded_elements;
+    const size_t guarded_bytes   = sizeof(T) * guarded_elements;
+
+    // Host input
+    T *h_in = (T*) malloc(unguarded_bytes);
+
+    // Device input and the two zero-filled outputs
+    T *d_in             = NULL;
+    T *d_out_unguarded  = NULL;
+    T *d_out_guarded    = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, unguarded_bytes));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, unguarded_bytes));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, guarded_bytes));
+    CubDebugExit(cudaMemset(d_out_unguarded, 0, unguarded_bytes));
+    CubDebugExit(cudaMemset(d_out_guarded, 0, guarded_bytes));
+
+    // Fill the host input, then upload it
+    for (int item = 0; item < unguarded_elements; ++item)
+        InitValue(INTEGER_SEED, h_in[item], item);
+    CubDebugExit(cudaMemcpy(d_in, h_in, unguarded_bytes, cudaMemcpyHostToDevice));
+
+    printf("TestIterator grid_size(%d) guarded_elements(%d) unguarded_elements(%d) "
+        "BLOCK_THREADS(%d) ITEMS_PER_THREAD(%d) LOAD_ALGORITHM(%d) STORE_ALGORITHM(%d) "
+        "LOAD_MODIFIER(%d) STORE_MODIFIER(%d) sizeof(T)(%d)\n",
+        grid_size, guarded_elements, unguarded_elements,
+        BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM,
+        LOAD_MODIFIER, STORE_MODIFIER,
+        (int) sizeof(T));
+
+    // Wrap the raw pointers in cache-modified iterators
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, T>    InputItrT;
+    typedef CacheModifiedOutputIterator<STORE_MODIFIER, T>  OutputItrT;
+    TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
+        h_in,
+        InputItrT(d_in),
+        OutputItrT(d_out_unguarded),
+        OutputItrT(d_out_guarded),
+        d_out_unguarded, d_out_guarded,     // raw pointers used for checking
+        grid_size, guarded_elements);
+
+    // Cleanup
+    free(h_in);                 // free(NULL) is a no-op
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
+    if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
+}
+
+/**
+ * Test iterator.  Overload selected when resources are insufficient:
+ * the configuration is skipped and nothing is launched.
+ */
+template <
+    typename            T,
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    BlockLoadAlgorithm  LOAD_ALGORITHM,
+    BlockStoreAlgorithm STORE_ALGORITHM,
+    CacheLoadModifier   LOAD_MODIFIER,
+    CacheStoreModifier  STORE_MODIFIER>
+void TestIterator(int grid_size, float fraction_valid, Int2Type<false> sufficient_resources)
+{
+    // Intentionally empty
+}
+
+
+/**
+ * Run the native-pointer and iterator tests for one configuration, skipping
+ * both when the configuration exceeds the shared-memory or thread limits.
+ */
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD,
+    BlockLoadAlgorithm LOAD_ALGORITHM, BlockStoreAlgorithm STORE_ALGORITHM>
+void TestPointerType(int grid_size, float fraction_valid)
+{
+    // Temp-storage types whose sizes are checked below
+    typedef typename BlockLoad<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM>::TempStorage     LoadStorageT;
+    typedef typename BlockStore<T, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM>::TempStorage   StoreStorageT;
+
+    // Limits: 16KB / 512 threads when SM100, SM110 or SM130 is defined,
+    // otherwise 48KB / 1024 threads
+#if defined(SM100) || defined(SM110) || defined(SM130)
+    static const size_t max_smem_bytes  = 1024 * 16;
+    static const int    max_threads     = 512;
+#else
+    static const size_t max_smem_bytes  = 1024 * 48;
+    static const int    max_threads     = 1024;
+#endif
+
+    static const bool sufficient_resources =
+        (sizeof(LoadStorageT) <= max_smem_bytes) &&
+        (sizeof(StoreStorageT) <= max_smem_bytes) &&
+        (BLOCK_THREADS <= max_threads);
+
+    // Tag-dispatch to the enabled or the no-op overloads
+    Int2Type<sufficient_resources> resources_tag;
+    TestNative<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(grid_size, fraction_valid, resources_tag);
+    TestIterator<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_DEFAULT, STORE_DEFAULT>(grid_size, fraction_valid, resources_tag);
+}
+
+
+/**
+ * Evaluate different time-slicing strategies.
+ *
+ * TestPointerType takes exactly five template arguments, so there is no
+ * separate time-slicing switch to toggle here: time-sliced variants are
+ * expressed through the *_TIMESLICED values of LOAD_ALGORITHM and
+ * STORE_ALGORITHM.  The configuration is therefore forwarded once, as given.
+ */
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD,
+    BlockLoadAlgorithm LOAD_ALGORITHM, BlockStoreAlgorithm STORE_ALGORITHM>
+void TestSlicedStrategy(int grid_size, float fraction_valid)
+{
+    // A sixth (bool) template argument would fail to compile as soon as
+    // this template is instantiated
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(grid_size, fraction_valid);
+}
+
+
+
+/**
+ * Evaluate different load/store strategies.  Overload selected for block
+ * sizes that are not a multiple of 32.
+ *
+ * Runs the DIRECT, TRANSPOSE and VECTORIZE load/store pairings, in order.
+ */
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void TestStrategy(int grid_size, float fraction_valid, Int2Type<false> is_warp_multiple)
+{
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD,
+        BLOCK_LOAD_DIRECT, BLOCK_STORE_DIRECT>(grid_size, fraction_valid);
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD,
+        BLOCK_LOAD_TRANSPOSE, BLOCK_STORE_TRANSPOSE>(grid_size, fraction_valid);
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD,
+        BLOCK_LOAD_VECTORIZE, BLOCK_STORE_VECTORIZE>(grid_size, fraction_valid);
+}
+
+
+/**
+ * Evaluate load/store strategies.  Overload selected for block sizes that are
+ * a multiple of 32: runs everything the Int2Type<false> overload runs, then
+ * the WARP_TRANSPOSE and WARP_TRANSPOSE_TIMESLICED pairings.
+ */
+template <typename T, int BLOCK_THREADS, int ITEMS_PER_THREAD>
+void TestStrategy(int grid_size, float fraction_valid, Int2Type<true> is_warp_multiple)
+{
+    // Strategies shared with the non-warp-multiple overload
+    TestStrategy<T, BLOCK_THREADS, ITEMS_PER_THREAD>(grid_size, fraction_valid, Int2Type<false>());
+
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD,
+        BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(grid_size, fraction_valid);
+    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD,
+        BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED>(grid_size, fraction_valid);
+}
+
+
+/**
+ * Evaluate different register blocking.
+ *
+ * Runs TestStrategy with 1, 3, 4 and 11 items per thread, dispatching on
+ * whether BLOCK_THREADS is a multiple of 32.
+ */
+template <typename T, int BLOCK_THREADS>
+void TestItemsPerThread(int grid_size, float fraction_valid)
+{
+    // Tag type selecting the TestStrategy overload
+    typedef Int2Type<BLOCK_THREADS % 32 == 0> WarpMultipleTag;
+
+    TestStrategy<T, BLOCK_THREADS, 1>(grid_size, fraction_valid, WarpMultipleTag());
+    TestStrategy<T, BLOCK_THREADS, 3>(grid_size, fraction_valid, WarpMultipleTag());
+    TestStrategy<T, BLOCK_THREADS, 4>(grid_size, fraction_valid, WarpMultipleTag());
+    TestStrategy<T, BLOCK_THREADS, 11>(grid_size, fraction_valid, WarpMultipleTag());
+}
+
+
+/**
+ * Evaluate different thread block sizes.
+ *
+ * Runs TestItemsPerThread with 15, 32, 72, 96 and 128 threads per block.
+ */
+template <typename T>
+void TestThreads(int grid_size, float fraction_valid)
+{
+    TestItemsPerThread<T, 15>(grid_size, fraction_valid);       // not a multiple of 32
+    TestItemsPerThread<T, 32>(grid_size, fraction_valid);       // multiple of 32
+    TestItemsPerThread<T, 72>(grid_size, fraction_valid);       // not a multiple of 32
+    TestItemsPerThread<T, 96>(grid_size, fraction_valid);       // multiple of 32
+    TestItemsPerThread<T, 128>(grid_size, fraction_valid);      // multiple of 32
+}
+
+
+/**
+ * Main.  Reads --v and --help, then runs the BlockLoad/BlockStore tests.
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+
+    // Print usage and stop when help is requested
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s [--device=<device-id>] [--v] \n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get ptx version
+    int ptx_version;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef QUICK_TEST
+
+    // Compile/run quick tests: one warp-transpose configuration each
+    TestNative<int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(
+        1, 0.8f, Int2Type<true>());
+    TestIterator<int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE, LOAD_DEFAULT, STORE_DEFAULT>(
+        1, 0.8f, Int2Type<true>());
+
+#else
+
+    // Compile/run thorough tests, all with the same grid size and valid fraction
+    const int   grid_size       = 2;
+    const float fraction_valid  = 0.8f;
+    TestThreads<char>(grid_size, fraction_valid);
+    TestThreads<int>(grid_size, fraction_valid);
+    TestThreads<long>(grid_size, fraction_valid);
+    TestThreads<long2>(grid_size, fraction_valid);
+    if (ptx_version > 120)      // Don't check doubles on PTX120 or below because they're down-converted
+        TestThreads<double2>(grid_size, fraction_valid);
+    TestThreads<TestFoo>(grid_size, fraction_valid);
+    TestThreads<TestBar>(grid_size, fraction_valid);
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/test/test_block_radix_sort.cu b/external/cub/test/test_block_radix_sort.cu
new file mode 100644
index 00000000000..b3418dae6b2
--- /dev/null
+++ b/external/cub/test/test_block_radix_sort.cu
@@ -0,0 +1,717 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockRadixSort utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <algorithm>
+#include <iostream>
+
+#include <cub/block/block_radix_sort.cuh>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;
+CachingDeviceAllocator  g_allocator(true);
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+
+/// Descending sort, blocked -> blocked: sorts with SortDescending, samples
+/// clock() into stop, then writes keys and values with StoreDirectBlocked
+template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
+__device__ __forceinline__ void TestBlockSort(
+    typename BlockRadixSort::TempStorage &temp_storage,
+    Key                         (&keys)[ITEMS_PER_THREAD],
+    Value                       (&values)[ITEMS_PER_THREAD],
+    Key *d_keys,                Value *d_values,
+    int begin_bit,              int end_bit,
+    clock_t                     &stop,
+    Int2Type<true>              is_descending,
+    Int2Type<true>              is_blocked_output)
+{
+    BlockRadixSort sorter(temp_storage);
+    sorter.SortDescending(keys, values, begin_bit, end_bit);
+    stop = clock();
+    StoreDirectBlocked(threadIdx.x, d_keys, keys);
+    StoreDirectBlocked(threadIdx.x, d_values, values);
+}
+
+/// Descending sort, blocked -> striped: sorts with SortDescendingBlockedToStriped,
+/// samples clock() into stop, then writes keys and values with StoreDirectStriped
+template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
+__device__ __forceinline__ void TestBlockSort(
+    typename BlockRadixSort::TempStorage &temp_storage,
+    Key                         (&keys)[ITEMS_PER_THREAD],
+    Value                       (&values)[ITEMS_PER_THREAD],
+    Key *d_keys,                Value *d_values,
+    int begin_bit,              int end_bit,
+    clock_t                     &stop,
+    Int2Type<true>              is_descending,
+    Int2Type<false>             is_blocked_output)
+{
+    BlockRadixSort sorter(temp_storage);
+    sorter.SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
+    stop = clock();
+    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
+    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
+}
+
+/// Ascending sort, blocked -> blocked: sorts with Sort, samples clock() into
+/// stop, then writes keys and values with StoreDirectBlocked
+template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
+__device__ __forceinline__ void TestBlockSort(
+    typename BlockRadixSort::TempStorage &temp_storage,
+    Key                         (&keys)[ITEMS_PER_THREAD],
+    Value                       (&values)[ITEMS_PER_THREAD],
+    Key *d_keys,                Value *d_values,
+    int begin_bit,              int end_bit,
+    clock_t                     &stop,
+    Int2Type<false>             is_descending,
+    Int2Type<true>              is_blocked_output)
+{
+    BlockRadixSort sorter(temp_storage);
+    sorter.Sort(keys, values, begin_bit, end_bit);
+    stop = clock();
+    StoreDirectBlocked(threadIdx.x, d_keys, keys);
+    StoreDirectBlocked(threadIdx.x, d_values, values);
+}
+
+/// Ascending sort, blocked -> striped: sorts with SortBlockedToStriped, samples
+/// clock() into stop, then writes keys and values with StoreDirectStriped
+template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
+__device__ __forceinline__ void TestBlockSort(
+    typename BlockRadixSort::TempStorage &temp_storage,
+    Key                         (&keys)[ITEMS_PER_THREAD],
+    Value                       (&values)[ITEMS_PER_THREAD],
+    Key *d_keys,                Value *d_values,
+    int begin_bit,              int end_bit,
+    clock_t                     &stop,
+    Int2Type<false>             is_descending,
+    Int2Type<false>             is_blocked_output)
+{
+    BlockRadixSort sorter(temp_storage);
+    sorter.SortBlockedToStriped(keys, values, begin_bit, end_bit);
+    stop = clock();
+    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
+    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
+}
+
+
+
+/**
+ * BlockRadixSort kernel.
+ *
+ * Each thread loads ITEMS_PER_THREAD keys and values in blocked arrangement
+ * from d_keys/d_values, sorts them through TestBlockSort (which also writes
+ * the results back over d_keys/d_values), and thread 0 records the absolute
+ * clock() difference around the sort in *d_elapsed.  All offsets are based
+ * on threadIdx.x only.
+ */
+template <
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    int                 RADIX_BITS,
+    bool                MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm  INNER_SCAN_ALGORITHM,
+    cudaSharedMemConfig SMEM_CONFIG,
+    int                 DESCENDING,
+    int                 BLOCKED_OUTPUT,
+    typename Key,       typename Value>
+__launch_bounds__ (BLOCK_THREADS, 1)
+__global__ void Kernel(
+    Key                         *d_keys,
+    Value                       *d_values,
+    int                         begin_bit,
+    int                         end_bit,
+    clock_t                     *d_elapsed)
+{
+    // Block-wide sorting primitive for this configuration
+    typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD, Value, RADIX_BITS,
+        MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> SortT;
+
+    // Shared-memory scratch space handed to the sort
+    __shared__ typename SortT::TempStorage sort_storage;
+
+    // Per-thread tile, loaded in blocked arrangement
+    Key     thread_keys[ITEMS_PER_THREAD];
+    Value   thread_values[ITEMS_PER_THREAD];
+    LoadDirectBlocked(threadIdx.x, d_keys, thread_keys);
+    LoadDirectBlocked(threadIdx.x, d_values, thread_values);
+
+    // Time the sort in clock cycles; TestBlockSort sets t_stop before storing
+    clock_t t_stop;
+    clock_t t_start = clock();
+
+    TestBlockSort<BLOCK_THREADS, SortT>(
+        sort_storage, thread_keys, thread_values, d_keys, d_values,
+        begin_bit, end_bit, t_stop,
+        Int2Type<DESCENDING>(), Int2Type<BLOCKED_OUTPUT>());
+
+    // Elapsed count, taken as a magnitude in case the two samples are not
+    // in increasing order; only thread 0 publishes it
+    const clock_t elapsed = (t_start > t_stop) ? t_start - t_stop : t_stop - t_start;
+    if (threadIdx.x == 0)
+        *d_elapsed = elapsed;
+}
+
+
+
+//---------------------------------------------------------------------
+// Host testing subroutines
+//---------------------------------------------------------------------
+
+
+/**
+ * Simple key-value pairing, ordered by key alone.
+ *
+ * IS_FLOAT defaults to whether Traits<Key> reports a floating point category.
+ */
+template <
+    typename    Key,
+    typename    Value,
+    bool        IS_FLOAT = (Traits<Key>::CATEGORY == FLOATING_POINT)>
+struct Pair
+{
+    Key     key;        // Sort key
+    Value   value;      // Payload carried along with the key
+
+    /// Strict ordering on key; value does not participate
+    bool operator<(const Pair &b) const { return key < b.key; }
+};
+
+/**
+ * Simple key-value pairing (specialized for floating point types).  Keys
+ * that compare equal are ordered by sign bit: negative zero before positive.
+ */
+template <typename Key, typename Value>
+struct Pair<Key, Value, true>
+{
+    Key     key;
+    Value   value;
+
+    bool operator<(const Pair &b) const
+    {
+        if (key < b.key)
+            return true;
+        if (key > b.key)
+            return false;
+
+        // Key in unsigned bits
+        typedef typename Traits<Key>::UnsignedBits UnsignedBits;
+        // Copy the bytes out with memcpy rather than aliasing Key as UnsignedBits
+        UnsignedBits key_bits, b_key_bits;
+        memcpy(&key_bits, &key, sizeof(UnsignedBits));
+        memcpy(&b_key_bits, &b.key, sizeof(UnsignedBits));
+        // True only when key has its sign bit set and b.key does not
+        const UnsignedBits HIGH_BIT = Traits<Key>::HIGH_BIT;
+        return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
+    }
+};
+
+
+/**
+ * Initialize key-value sorting problem.
+ *
+ * Keys come from InitValue and values from RandomBits; key bits outside
+ * [begin_bit, end_bit) are cleared.  The reference arrays receive the pairs
+ * stably sorted by key (descending when DESCENDING is set).
+ */
+template <bool DESCENDING, typename Key, typename Value>
+void Initialize(
+    GenMode gen_mode,
+    Key *h_keys, Value *h_values,
+    Key *h_reference_keys, Value *h_reference_values,
+    int num_items, int entropy_reduction, int begin_bit, int end_bit)
+{
+    typedef Pair<Key, Value> PairT;
+    PairT *pairs            = new PairT[num_items];
+    PairT *const pairs_end  = pairs + num_items;
+    // True when [begin_bit, end_bit) does not cover the whole key
+    const bool partial_range = (begin_bit > 0) || (end_bit < sizeof(Key) * 8);
+
+    for (int item = 0; item < num_items; ++item)
+    {
+        InitValue(gen_mode, h_keys[item], item);
+        RandomBits(h_values[item]);
+        if (partial_range)
+        {
+            // Clear key bits outside the range via a 64-bit scratch word
+            unsigned long long scratch = 0;
+            memcpy(&scratch, &h_keys[item], sizeof(Key));
+            scratch &= ((1ull << (end_bit - begin_bit)) - 1) << begin_bit;
+            memcpy(&h_keys[item], &scratch, sizeof(Key));
+        }
+        pairs[item].key     = h_keys[item];
+        pairs[item].value   = h_values[item];
+    }
+
+    // Reversing around a stable ascending sort gives a descending order
+    if (DESCENDING) std::reverse(pairs, pairs_end);
+    std::stable_sort(pairs, pairs_end);
+    if (DESCENDING) std::reverse(pairs, pairs_end);
+
+    for (int item = 0; item < num_items; ++item)
+    {
+        h_reference_keys[item]      = pairs[item].key;
+        h_reference_values[item]    = pairs[item].value;
+    }
+
+    delete[] pairs;
+}
+
+
+
+
+/**
+ * Test BlockRadixSort kernel: sort one tile with a single thread block and check it against a host-computed reference
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    cudaSharedMemConfig     SMEM_CONFIG,
+    bool                    DESCENDING,
+    bool                    BLOCKED_OUTPUT,
+    typename                Key,
+    typename                Value>
+void TestDriver(
+    GenMode                 gen_mode,               // key generation mode forwarded to Initialize
+    int                     entropy_reduction,      // forwarded to Initialize and echoed in the banner
+    int                     begin_bit,              // start of the key-bit range handed to the kernel
+    int                     end_bit)                // end of the key-bit range handed to the kernel
+{
+    enum
+    {
+        TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD,   // number of items in the single tile
+        KEYS_ONLY = Equals<Value, NullType>::VALUE,     // nonzero when Value is NullType
+    };
+
+    // Allocate host arrays (problem and reference solution)
+    Key     *h_keys             = new Key[TILE_SIZE];
+    Key     *h_reference_keys   = new Key[TILE_SIZE];
+    Value   *h_values           = new Value[TILE_SIZE];
+    Value   *h_reference_values = new Value[TILE_SIZE];
+
+    // Allocate device arrays (keys and values are sorted in place; d_elapsed holds one clock count)
+    Key     *d_keys     = NULL;
+    Value   *d_values   = NULL;
+    clock_t *d_elapsed  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys, sizeof(Key) * TILE_SIZE));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * TILE_SIZE));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
+
+    // Initialize problem and solution on host
+    Initialize<DESCENDING>(gen_mode, h_keys, h_values, h_reference_keys, h_reference_values,
+        TILE_SIZE, entropy_reduction, begin_bit, end_bit);
+
+    // Copy problem to device
+    CubDebugExit(cudaMemcpy(d_keys, h_keys, sizeof(Key) * TILE_SIZE, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * TILE_SIZE, cudaMemcpyHostToDevice));
+
+    printf("%s "
+        "BLOCK_THREADS(%d) "
+        "ITEMS_PER_THREAD(%d) "
+        "RADIX_BITS(%d) "
+        "MEMOIZE_OUTER_SCAN(%d) "
+        "INNER_SCAN_ALGORITHM(%d) "
+        "SMEM_CONFIG(%d) "
+        "DESCENDING(%d) "
+        "BLOCKED_OUTPUT(%d) "
+        "sizeof(Key)(%d) "
+        "sizeof(Value)(%d) "
+        "gen_mode(%d), "
+        "entropy_reduction(%d) "
+        "begin_bit(%d) "
+        "end_bit(%d), "
+        "samples(%d)\n",
+            ((KEYS_ONLY) ? "Keys-only" : "Key-value"),
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            RADIX_BITS,
+            MEMOIZE_OUTER_SCAN,
+            INNER_SCAN_ALGORITHM,
+            SMEM_CONFIG,
+            DESCENDING,
+            BLOCKED_OUTPUT,
+            (int) sizeof(Key),
+            (int) sizeof(Value),
+            gen_mode,
+            entropy_reduction,
+            begin_bit,
+            end_bit,
+            g_num_rand_samples);
+
+    // Set shared memory config (checked like the other runtime calls so a failure is reported at this line)
+    CubDebugExit(cudaDeviceSetSharedMemConfig(SMEM_CONFIG));
+
+    // Run kernel (one thread block of BLOCK_THREADS threads)
+    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT><<<1, BLOCK_THREADS>>>(
+        d_keys, d_values, begin_bit, end_bit, d_elapsed);
+
+    // Flush kernel output / errors
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Check keys results (a nonzero compare result is reported as FAIL)
+    printf("\tKeys: ");
+    int compare = CompareDeviceResults(h_reference_keys, d_keys, TILE_SIZE, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Check value results (skipped for keys-only sorting)
+    if (!KEYS_ONLY)
+    {
+        printf("\tValues: ");
+        int compare = CompareDeviceResults(h_reference_values, d_values, TILE_SIZE, g_verbose, g_verbose);
+        printf("%s\n", compare ? "FAIL" : "PASS");
+        AssertEquals(0, compare);
+    }
+    printf("\n");
+
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+    printf("\n");
+
+    // Cleanup host and device buffers
+    if (h_keys)             delete[] h_keys;
+    if (h_reference_keys)   delete[] h_reference_keys;
+    if (h_values)           delete[] h_values;
+    if (h_reference_values) delete[] h_reference_values;
+    if (d_keys)             CubDebugExit(g_allocator.DeviceFree(d_keys));
+    if (d_values)           CubDebugExit(g_allocator.DeviceFree(d_values));
+    if (d_elapsed)          CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Test driver (valid tile size <= MAX_SMEM_BYTES): sweeps bit ranges and key distributions through TestDriver
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    cudaSharedMemConfig     SMEM_CONFIG,
+    bool                    DESCENDING,
+    bool                    BLOCKED_OUTPUT,
+    typename                Key,
+    typename                Value>
+void TestValid(Int2Type<true> fits_smem_capacity)      // tag argument: selects this overload, not read in the body
+{
+    // Iterate begin_bit (0 and 1)
+    for (int begin_bit = 0; begin_bit <= 1; begin_bit++)
+    {
+        // Iterate end bit: starts at begin_bit + 1 and advances as 2 * end_bit + begin_bit while within the key width
+        for (int end_bit = begin_bit + 1; end_bit <= sizeof(Key) * 8; end_bit = end_bit * 2 + begin_bit)
+        {
+            // Uniform key distribution
+            TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
+                UNIFORM, 0, begin_bit, end_bit);
+
+            // Sequential key distribution
+            TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
+                INTEGER_SEED, 0, begin_bit, end_bit);
+
+            // Iterate random with entropy_reduction (0, 3, 6, 9)
+            for (int entropy_reduction = 0; entropy_reduction <= 9; entropy_reduction += 3)
+            {
+                TestDriver<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(
+                    RANDOM, entropy_reduction, begin_bit, end_bit);
+            }
+        }
+    }
+}
+
+
+/**
+ * Test driver (invalid tile size): overload selected by Int2Type<false>; runs nothing
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    cudaSharedMemConfig     SMEM_CONFIG,
+    bool                    DESCENDING,
+    bool                    BLOCKED_OUTPUT,
+    typename                Key,
+    typename                Value>
+void TestValid(Int2Type<false> fits_smem_capacity)
+{}  // intentionally empty
+
+
+/**
+ * Test descending/ascending and to-striped/to-blocked, dispatching on whether TempStorage fits the smem budget
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    cudaSharedMemConfig     SMEM_CONFIG,
+    typename                Key,
+    typename                Value>
+void Test()
+{
+    // Check size of smem storage for the target arch to make sure it will fit
+    typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD, Value, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> BlockRadixSortT;
+
+#if defined(SM100) || defined(SM110) || defined(SM130)
+    Int2Type<sizeof(typename BlockRadixSortT::TempStorage) <= 16 * 1024> fits_smem_capacity;
+#else
+    Int2Type<(sizeof(typename BlockRadixSortT::TempStorage) <= 48 * 1024)> fits_smem_capacity;
+#endif
+
+    // Sort-descending, to-striped (DESCENDING = true, BLOCKED_OUTPUT = false)
+    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, true, false, Key, Value>(fits_smem_capacity);
+
+    // Sort-ascending, to-blocked (DESCENDING = false, BLOCKED_OUTPUT = true)
+    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, false, true, Key, Value>(fits_smem_capacity);
+
+    // Not necessary (the remaining two DESCENDING / BLOCKED_OUTPUT combinations)
+//    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, false, false, Key, Value>(fits_smem_capacity);
+//    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, true, true, Key, Value>(fits_smem_capacity);
+}
+
+
+/**
+ * Test keys-only sorting (Value = NullType) under the 4-byte and, where compiled in, the 8-byte smem bank config
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    typename                Key>
+void TestKeys()
+{
+    // Test keys-only sorting with both smem configs (the 8-byte one is compiled out for SM100/SM110/SM130/SM200)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, NullType>();    // Keys-only (4-byte smem bank config)
+#if !defined(SM100) && !defined(SM110) && !defined(SM130) && !defined(SM200)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeEightByte, Key, NullType>();   // Keys-only (8-byte smem bank config)
+#endif
+}
+
+
+/**
+ * Test key-value pair sorting with three value types (char, Key, TestFoo) under the 4-byte smem bank config
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
+    typename                Key>
+void TestKeysAndPairs()
+{
+    // Test pairs sorting with only 4-byte configs (no keys-only case is run from here)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, char>();        // With small-values
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, Key>();         // With same-values
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, cudaSharedMemBankSizeFourByte, Key, TestFoo>();     // With large values
+}
+
+
+/**
+ * Test key type: unsigned keys-only when TEST_KEYS_ONLY is defined, otherwise signed and fp keys with paired values
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM>
+void Test()
+{
+    // Get ptx version (only consulted in the paired-values branch below)
+    int ptx_version;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef TEST_KEYS_ONLY
+
+    // Test unsigned types with keys-only
+    TestKeys<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, unsigned char>();
+    TestKeys<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, unsigned short>();
+    TestKeys<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, unsigned int>();
+    TestKeys<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, unsigned long>();
+    TestKeys<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, unsigned long long>();
+
+#else
+
+    // Test signed and fp types with paired values
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, char>();
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, short>();
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, int>();
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, long>();
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, long long>();
+    TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, float>();
+    if (ptx_version > 120)
+    {
+        // Don't check doubles on PTX120 or below because they're down-converted
+        TestKeysAndPairs<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, double>();
+    }
+
+#endif
+}
+
+
+/**
+ * Test inner scan algorithm: runs the key-type tests once with BLOCK_SCAN_RAKING and once with BLOCK_SCAN_WARP_SCANS
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS,
+    bool                    MEMOIZE_OUTER_SCAN>
+void Test()
+{
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, BLOCK_SCAN_RAKING>();
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, BLOCK_SCAN_WARP_SCANS>();
+}
+
+
+/**
+ * Test outer scan algorithm: runs the inner-scan tests with MEMOIZE_OUTER_SCAN set to true, then false
+ */
+template <
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    int                     RADIX_BITS>
+void Test()
+{
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, true>();
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, false>();
+}
+
+
+/**
+ * Test radix bits: runs the outer-scan tests with RADIX_BITS of 1, 2 and 5
+ */
+template <
+    int BLOCK_THREADS,
+    int ITEMS_PER_THREAD>
+void Test()
+{
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, 1>();
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, 2>();
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, 5>();
+}
+
+
+/**
+ * Test items per thread: 1 and 11 always, plus 4 unless building for SM100/SM110/SM130
+ */
+template <int BLOCK_THREADS>
+void Test()
+{
+    Test<BLOCK_THREADS, 1>();
+#if defined(SM100) || defined(SM110) || defined(SM130)
+    // Open64 compiler can't handle the number of test cases
+#else
+    Test<BLOCK_THREADS, 4>();
+#endif
+    Test<BLOCK_THREADS, 11>();
+}
+
+
+
+/**
+ * Main: parses the command line, initializes the device, then runs either the quick test or the thorough test set
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line ("v" sets g_verbose)
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+
+    // Print usage and exit when "help" is given
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#ifdef QUICK_TEST
+
+    {
+        typedef float T;    // single keys-only case over the full key width
+        TestDriver<32, 4, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(INTEGER_SEED, 0, 0, sizeof(T) * 8);
+    }
+/*
+    // Compile/run quick tests
+    typedef unsigned int T;
+    TestDriver<64, 17, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(RANDOM, 0, 0, sizeof(T) * 8);
+    TestDriver<96, 8, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(RANDOM, 0, 0, sizeof(T) * 8);
+    TestDriver<128, 2, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(RANDOM, 0, 0, sizeof(T) * 8);
+*/
+
+#else
+
+    // Compile/run thorough tests (block sizes of 32, 64 and 160 threads)
+    Test<32>();
+    Test<64>();
+    Test<160>();
+
+
+#endif  // QUICK_TEST
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/test/test_block_reduce.cu b/external/cub/test/test_block_reduce.cu
new file mode 100644
index 00000000000..23261582c16
--- /dev/null
+++ b/external/cub/test/test_block_reduce.cu
@@ -0,0 +1,822 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockReduce utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <device_functions.h>
+#include <typeinfo>
+
+#include <cub/block/block_reduce.cuh>
+#include <cub/block/block_load.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/util_allocator.cuh>
+#include <cub/util_debug.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose       = false;    // gates the input dump in Initialize; also passed to CompareDeviceResults
+int                     g_repeat        = 0;        // not referenced in the portion of this file reviewed here
+CachingDeviceAllocator  g_allocator(true);          // allocator used for every device buffer in the tests below
+
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+
+/// Generic reduction (full, 1): one item per thread, reduced with the caller-supplied reduction_op
+template <typename BlockReduceT, typename T, typename ReductionOp>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T (&data)[1], ReductionOp &reduction_op)
+{
+    return block_reduce.Reduce(data[0], reduction_op);
+}
+
+/// Generic reduction (full, ITEMS_PER_THREAD): passes the whole per-thread array to Reduce with reduction_op
+template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD, typename ReductionOp>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], ReductionOp &reduction_op)
+{
+    return block_reduce.Reduce(data, reduction_op);
+}
+
+/// Generic reduction (partial, 1): one item per thread; valid_threads is forwarded to Reduce
+template <typename BlockReduceT, typename T, typename ReductionOp>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T &data, ReductionOp &reduction_op, int valid_threads)
+{
+    return block_reduce.Reduce(data, reduction_op, valid_threads);
+}
+
+/// Sum reduction (full, 1): calls block_reduce.Sum(); reduction_op only selects this overload and is not used
+template <typename BlockReduceT, typename T>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T (&data)[1], Sum &reduction_op)
+{
+    return block_reduce.Sum(data[0]);
+}
+
+/// Sum reduction (full, ITEMS_PER_THREAD): calls block_reduce.Sum() on the per-thread array; reduction_op is not used
+template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T (&data)[ITEMS_PER_THREAD], Sum &reduction_op)
+{
+    return block_reduce.Sum(data);
+}
+
+/// Sum reduction (partial, 1): calls block_reduce.Sum() with valid_threads; reduction_op is not used
+template <typename BlockReduceT, typename T>
+__device__ __forceinline__ T DeviceTest(
+    BlockReduceT &block_reduce, T &data, Sum &reduction_op, int valid_threads)
+{
+    return block_reduce.Sum(data, valid_threads);
+}
+
+
+/**
+ * Test full-tile reduction kernel: reduces `tiles` full tiles of BLOCK_THREADS * ITEMS_PER_THREAD items
+ * each; thread 0 writes the aggregate to d_out[0] and the accumulated clock count to *d_elapsed
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    int                     ITEMS_PER_THREAD,
+    typename                T,
+    typename                ReductionOp>
+__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
+__global__ void FullTileReduceKernel(
+    T                       *d_in,
+    T                       *d_out,
+    ReductionOp             reduction_op,
+    int                     tiles,
+    clock_t                 *d_elapsed)
+{
+    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Cooperative thread block reduction utility type (returns aggregate in thread 0)
+    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename BlockReduceT::TempStorage temp_storage;
+
+    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
+
+    // Per-thread tile data
+    T data[ITEMS_PER_THREAD];
+
+    // Load first tile of data
+    int block_offset = 0;
+
+    if (block_offset < TILE_SIZE * tiles)   // nothing is read or written when tiles <= 0
+    {
+        LoadDirectBlocked(linear_tid, d_in + block_offset, data);
+        block_offset += TILE_SIZE;
+
+        // Start cycle timer
+        clock_t start = clock();
+
+        // Cooperative reduce first tile
+        BlockReduceT block_reduce(temp_storage) ;
+        T block_aggregate = DeviceTest(block_reduce, data, reduction_op);
+
+        // Stop cycle timer
+ #if CUB_PTX_ARCH == 100
+        // Bug: recording stop clock causes mis-write of running prefix value
+        clock_t stop = 0;
+#else
+        clock_t stop = clock();
+#endif // CUB_PTX_ARCH == 100
+        clock_t elapsed = (start > stop) ? start - stop : stop - start;     // absolute difference
+
+        // Loop over the remaining input tiles
+        while (block_offset < TILE_SIZE * tiles)
+        {
+            // Barrier between thread block reductions (temp_storage is reused by the next one)
+            __syncthreads();
+    
+            // Load tile of data
+            LoadDirectBlocked(linear_tid, d_in + block_offset, data);
+            block_offset += TILE_SIZE;
+
+            // Start cycle timer (shadows the outer start/stop on purpose; only elapsed carries over)
+            clock_t start = clock();
+
+            // Cooperatively reduce the tile's aggregate
+            BlockReduceT block_reduce(temp_storage) ;
+            T tile_aggregate = DeviceTest(block_reduce, data, reduction_op);
+
+            // Stop cycle timer
+#if CUB_PTX_ARCH == 100
+            // Bug: recording stop clock causes mis-write of running prefix value
+            clock_t stop = 0;
+#else
+            clock_t stop = clock();
+#endif // CUB_PTX_ARCH == 100
+            elapsed += (start > stop) ? start - stop : stop - start;
+
+            // Reduce thread block aggregate (fold this tile into the running result)
+            block_aggregate = reduction_op(block_aggregate, tile_aggregate);
+        }
+
+        // Store data (only thread 0 writes the result and the timing)
+        if (linear_tid == 0)
+        {
+            d_out[0] = block_aggregate;
+            *d_elapsed = elapsed;
+        }
+    }
+}
+
+
+
+/**
+ * Test partial-tile reduction kernel (where num_items < BLOCK_THREADS): one item per thread, num_items passed as the valid count
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    typename                T,
+    typename                ReductionOp>
+__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
+__global__ void PartialTileReduceKernel(
+    T                       *d_in,
+    T                       *d_out,
+    int                     num_items,
+    ReductionOp             reduction_op,
+    clock_t                 *d_elapsed)
+{
+    // Cooperative thread block reduction utility type (returns aggregate only in thread-0)
+    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename BlockReduceT::TempStorage temp_storage;
+
+    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
+
+    // Per-thread tile data (left unassigned in threads with linear_tid >= num_items)
+    T partial;
+
+    // Load partial tile data
+    if (linear_tid < num_items)
+    {
+        partial = d_in[linear_tid];
+    }
+
+    // Start cycle timer
+    clock_t start = clock();
+
+    // Cooperatively reduce the tile's aggregate (num_items is the valid-thread count)
+    BlockReduceT block_reduce(temp_storage) ;
+    T tile_aggregate = DeviceTest(block_reduce, partial, reduction_op, num_items);
+
+    // Stop cycle timer
+#if CUB_PTX_ARCH == 100
+    // Bug: recording stop clock causes mis-write of running prefix value
+    clock_t stop = 0;
+#else
+    clock_t stop = clock();
+#endif // CUB_PTX_ARCH == 100
+
+    clock_t elapsed = (start > stop) ? start - stop : stop - start;     // absolute difference
+
+    // Store data (only thread 0 writes the result and the timing)
+    if (linear_tid == 0)
+    {
+        d_out[0] = tile_aggregate;
+        *d_elapsed = elapsed;
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Host utility subroutines
+//---------------------------------------------------------------------
+
+/**
+ * Generate num_items inputs into h_in and fold them, in order, into h_reference[0]
+ */
+template <
+    typename    T,
+    typename    ReductionOp>
+void Initialize(
+    GenMode     gen_mode,
+    T           *h_in,
+    T           h_reference[1],
+    ReductionOp reduction_op,
+    int         num_items)
+{
+    for (int idx = 0; idx < num_items; ++idx)
+    {
+        InitValue(gen_mode, h_in[idx], idx);
+        // Element 0 seeds the running aggregate; each later element is folded into it
+        if (idx > 0)
+            h_reference[0] = reduction_op(h_reference[0], h_in[idx]);
+        else
+            h_reference[0] = h_in[0];
+    }
+
+    if (!g_verbose)
+        return;
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n");
+}
+
+
+//---------------------------------------------------------------------
+// Full tile test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Test full-tile reduction.  (Specialized for sufficient resources): runs the kernel on tiles * TILE_SIZE items and checks d_out[0]
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    int                     ITEMS_PER_THREAD,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    int                     tiles,
+    ReductionOp             reduction_op,
+    Int2Type<true>          sufficient_resources)       // tag argument: selects this overload, not read in the body
+{
+    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    int num_items = TILE_SIZE * tiles;
+
+    // Allocate host arrays (h_reference holds the single expected aggregate)
+    T *h_in = new T[num_items];
+    T h_reference[1];
+
+    // Initialize problem
+    Initialize(gen_mode, h_in, h_reference, reduction_op, num_items);
+
+    // Initialize/clear device arrays (d_elapsed is sized as unsigned long long although it is a clock_t pointer)
+    T       *d_in = NULL;
+    T       *d_out = NULL;
+    clock_t *d_elapsed = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * 1));
+
+    // Test multi-tile (unguarded); the banner labels any non-Sum operator as "Max"
+    printf("TestFullTile %s, %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), ITEMS_PER_THREAD(%d), tiles(%d), %s (%d bytes) elements:\n",
+        Equals<ReductionOp, Sum>::VALUE ? "Sum" : "Max",
+        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS",
+        gen_mode,
+        num_items,
+        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
+        ITEMS_PER_THREAD,
+        tiles,
+        typeid(T).name(),
+        (int) sizeof(T));
+    fflush(stdout);
+
+    dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);     // single block with the requested 3D shape
+    FullTileReduceKernel<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD><<<1, block_dims>>>(
+        d_in,
+        d_out,
+        reduction_op,
+        tiles,
+        d_elapsed);
+
+    CubDebugExit(cudaPeekAtLastError());        // launch errors
+    CubDebugExit(cudaDeviceSynchronize());      // execution errors
+
+    // Copy out and display results (a nonzero compare result is reported as FAIL)
+    printf("\tReduction results: ");
+    int compare = CompareDeviceResults(h_reference, d_out, 1, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    // Cleanup host and device buffers
+    if (h_in) delete[] h_in;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Test full-tile reduction.  (Specialized for insufficient resources): overload selected by Int2Type<false>; runs nothing
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    int                     ITEMS_PER_THREAD,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    int                     tiles,
+    ReductionOp             reduction_op,
+    Int2Type<false>         sufficient_resources)
+{}  // intentionally empty
+
+
+/**
+ * Test full-tile reduction.  Dispatches on whether TempStorage and the block's thread count are within the compiled-in limits
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    int                     ITEMS_PER_THREAD,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    int                     tiles,
+    ReductionOp             reduction_op)
+{
+    // Check size of smem storage for the target arch to make sure it will fit
+    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, TEST_ARCH> BlockReduceT;
+
+    enum 
+    {
+#if defined(SM100) || defined(SM110) || defined(SM130)
+        sufficient_smem       = (sizeof(typename BlockReduceT::TempStorage) <= 16 * 1024),     // 16 KB limit
+        sufficient_threads    = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 512),            // 512-thread limit
+#else
+        sufficient_smem       = (sizeof(typename BlockReduceT::TempStorage) <= 48 * 1024),     // 48 KB limit
+        sufficient_threads    = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 1024),           // 1024-thread limit
+#endif
+    };
+
+    TestFullTile<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op, Int2Type<sufficient_smem && sufficient_threads>());
+}
+
+
+/**
+ * Run battery of tests for different thread block dimensions: (BLOCK_THREADS, 1, 1) and (BLOCK_THREADS, 2, 2)
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_THREADS,
+    int                     ITEMS_PER_THREAD,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    int                     tiles,
+    ReductionOp             reduction_op)
+{
+    TestFullTile<ALGORITHM, BLOCK_THREADS, 1, 1, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op);
+    TestFullTile<ALGORITHM, BLOCK_THREADS, 2, 2, ITEMS_PER_THREAD, T>(gen_mode, tiles, reduction_op);  // BLOCK_THREADS is the X dimension here
+}
+
+/**
+ * Run battery of tests for different items per thread (1 and 4)
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_THREADS,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    int                     tiles,
+    ReductionOp             reduction_op)
+{
+    TestFullTile<ALGORITHM, BLOCK_THREADS, 1, T>(gen_mode, tiles, reduction_op);
+    TestFullTile<ALGORITHM, BLOCK_THREADS, 4, T>(gen_mode, tiles, reduction_op);
+}
+
+
+/**
+ * Run battery of full-tile tests for different numbers of tiles (1 and 2)
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_THREADS,
+    typename                T,
+    typename                ReductionOp>
+void TestFullTile(
+    GenMode                 gen_mode,
+    ReductionOp             reduction_op)
+{
+    for (int tiles = 1; tiles < 3; tiles++)
+    {
+        TestFullTile<ALGORITHM, BLOCK_THREADS, T>(gen_mode, tiles, reduction_op);
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Partial-tile test generation
+//---------------------------------------------------------------------
+
+/**
+ * Partial-tile reduction test, selected when the block configuration fits the available resources.
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    typename                T,
+    typename                ReductionOp>
+void TestPartialTile(
+    GenMode                 gen_mode,
+    int                     num_items,
+    ReductionOp             reduction_op,
+    Int2Type<true>          sufficient_resources)
+{
+    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+    const int TILE_SIZE         = BLOCK_THREADS;
+
+    // Host-side input plus the single expected reduction value
+    T *host_input = new T[num_items];
+    T host_expected[1];
+
+    // Generate the input and its reference result
+    Initialize(gen_mode, host_input, host_expected, reduction_op, num_items);
+
+    // Device buffers: input tile, one-element result, elapsed-clock slot
+    T       *dev_input = NULL;
+    T       *dev_result = NULL;
+    clock_t *dev_clocks = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&dev_clocks, sizeof(unsigned long long)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&dev_input, sizeof(T) * TILE_SIZE));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&dev_result, sizeof(T) * 1));
+    CubDebugExit(cudaMemcpy(dev_input, host_input, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(dev_result, 0, sizeof(T) * 1));
+
+    const char *algorithm_label =
+        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" :
+        (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS";
+    printf("TestPartialTile %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), %s (%d bytes) elements:\n",
+        algorithm_label, gen_mode, num_items,
+        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
+        typeid(T).name(), (int) sizeof(T));
+    fflush(stdout);
+
+    dim3 launch_shape(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
+    PartialTileReduceKernel<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z><<<1, launch_shape>>>(
+        dev_input,
+        dev_result,
+        num_items,
+        reduction_op,
+        dev_clocks);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Compare the device result against the host reference and report
+    printf("\tReduction results: ");
+    int mismatch = CompareDeviceResults(host_expected, dev_result, 1, g_verbose, g_verbose);
+    printf("%s\n", mismatch ? "FAIL" : "PASS");
+    AssertEquals(0, mismatch);
+
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(dev_clocks, 1);
+
+    // Release host and device storage
+    delete[] host_input;
+    if (dev_input) CubDebugExit(g_allocator.DeviceFree(dev_input));
+    if (dev_result) CubDebugExit(g_allocator.DeviceFree(dev_result));
+    if (dev_clocks) CubDebugExit(g_allocator.DeviceFree(dev_clocks));
+}
+
+
+
+/**
+ * Test partial-tile reduction (specialized for insufficient resources): deliberately does nothing
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    typename                T,
+    typename                ReductionOp>
+void TestPartialTile(
+    GenMode                 gen_mode,
+    int                     num_items,
+    ReductionOp             reduction_op,
+    Int2Type<false>         sufficient_resources)
+{}    // no kernel is launched for configurations that do not fit
+
+
+/**
+ *  Check at compile time whether the block configuration fits, then dispatch to the matching TestPartialTile overload
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_DIM_X,
+    int                     BLOCK_DIM_Y,
+    int                     BLOCK_DIM_Z,
+    typename                T,
+    typename                ReductionOp>
+void TestPartialTile(
+    GenMode                 gen_mode,
+    int                     num_items,
+    ReductionOp             reduction_op)
+{
+    // Check size of smem storage for the target arch to make sure it will fit
+    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, TEST_ARCH> BlockReduceT;
+
+    enum    // compile-time flags consumed by the Int2Type dispatch below
+    {
+#if defined(SM100) || defined(SM110) || defined(SM130)
+        sufficient_smem       = sizeof(typename BlockReduceT::TempStorage)  <= 16 * 1024,   // TempStorage within 16 KB
+        sufficient_threads    = (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)   <= 512,         // at most 512 threads per block
+#else
+        sufficient_smem       = sizeof(typename BlockReduceT::TempStorage)  <= 48 * 1024,   // TempStorage within 48 KB
+        sufficient_threads    = (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)   <= 1024,        // at most 1024 threads per block
+#endif
+    };
+    // Int2Type<true> selects the overload that runs the kernel; Int2Type<false> selects the empty overload
+    TestPartialTile<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, T>(gen_mode, num_items, reduction_op, Int2Type<sufficient_smem && sufficient_threads>());
+}
+
+
+
+/**
+ *  Sweep partial-tile tests over several item counts below BLOCK_THREADS, for two thread block shapes
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_THREADS,
+    typename                T,
+    typename                ReductionOp>
+void TestPartialTile(
+    GenMode                 gen_mode,
+    ReductionOp             reduction_op)
+{
+    // Advance by a fifth of the block per step, but never by less than one item
+    const int step = CUB_MAX(1, BLOCK_THREADS / 5);
+
+    for (int valid_items = 1; valid_items < BLOCK_THREADS; valid_items += step)
+    {
+        TestPartialTile<ALGORITHM, BLOCK_THREADS, 1, 1, T>(gen_mode, valid_items, reduction_op);
+        TestPartialTile<ALGORITHM, BLOCK_THREADS, 2, 2, T>(gen_mode, valid_items, reduction_op);
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Run the full-tile and partial-tile test batteries for each input generation mode
+ */
+template <
+    BlockReduceAlgorithm    ALGORITHM,
+    int                     BLOCK_THREADS,
+    typename                T,
+    typename                ReductionOp>
+void Test(
+    ReductionOp             reduction_op)
+{
+    TestFullTile<ALGORITHM, BLOCK_THREADS, T>(UNIFORM, reduction_op);
+    TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(UNIFORM, reduction_op);
+
+    TestFullTile<ALGORITHM, BLOCK_THREADS, T>(INTEGER_SEED, reduction_op);
+    TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(INTEGER_SEED, reduction_op);
+
+    if (Traits<T>::CATEGORY != FLOATING_POINT)
+    {
+        // RANDOM inputs are skipped for floating-point T because of numerical stability
+        TestFullTile<ALGORITHM, BLOCK_THREADS, T>(RANDOM, reduction_op);
+        TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(RANDOM, reduction_op);
+    }
+}
+
+
+/**
+ * Run battery of tests for the block-reduction algorithms enabled at compile time (TEST_RAKING, TEST_WARP_REDUCTIONS)
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        T,
+    typename        ReductionOp>
+void Test(
+    ReductionOp     reduction_op)
+{
+#ifdef TEST_RAKING
+    Test<BLOCK_REDUCE_RAKING, BLOCK_THREADS, T>(reduction_op);
+    Test<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, BLOCK_THREADS, T>(reduction_op);
+#endif
+#ifdef TEST_WARP_REDUCTIONS
+    Test<BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_THREADS, T>(reduction_op);
+#endif
+}
+
+
+/**
+ * Run battery of tests for different block sizes (BLOCK_THREADS = 7, 32, 63, 97, 128, 238)
+ */
+template <
+    typename        T,
+    typename        ReductionOp>
+void Test(
+    ReductionOp     reduction_op)
+{
+    Test<7,   T>(reduction_op);
+    Test<32,  T>(reduction_op);
+    Test<63,  T>(reduction_op);
+    Test<97,  T>(reduction_op);
+    Test<128, T>(reduction_op);
+    Test<238, T>(reduction_op);
+}
+
+
+/**
+ * Run battery of tests for different reduction operators (Sum and Max)
+ */
+template <typename T>
+void Test()
+{
+    Test<T>(Sum());
+    Test<T>(Max());
+}
+
+
+/**
+ * Main: parse the command line, initialize the device, then run the quick (QUICK_TEST) or thorough test battery
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get ptx version
+    int ptx_version = 0;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef QUICK_TEST
+
+    // Compile/run quick tests
+
+
+    printf("\n full tile ------------------------\n\n");
+
+    TestFullTile<BLOCK_REDUCE_RAKING,                   128, 1, 1, 4, int>(RANDOM, 1, Sum());
+    TestFullTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,  128, 1, 1, 4, int>(RANDOM, 1, Sum());
+    TestFullTile<BLOCK_REDUCE_WARP_REDUCTIONS,          128, 1, 1, 4, int>(RANDOM, 1, Sum());
+
+    TestFullTile<BLOCK_REDUCE_RAKING,                   128, 1, 1, 1, int>(RANDOM, 1, Sum());
+    TestFullTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,  128, 1, 1, 1, int>(RANDOM, 1, Sum());
+    TestFullTile<BLOCK_REDUCE_WARP_REDUCTIONS,          128, 1, 1, 1, int>(RANDOM, 1, Sum());
+
+    printf("\n partial tile ------------------------\n\n");
+
+    TestPartialTile<BLOCK_REDUCE_RAKING,                   128, 1, 1, int>(RANDOM, 7, Sum());
+    TestPartialTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,  128, 1, 1, int>(RANDOM, 7, Sum());
+    TestPartialTile<BLOCK_REDUCE_WARP_REDUCTIONS,          128, 1, 1, int>(RANDOM, 7, Sum());
+
+#else
+
+    // Compile/run thorough tests
+    for (int i = 0; i <= g_repeat; ++i)                 // inclusive bound: the suite runs once plus g_repeat repetitions
+    {
+        // primitives
+        Test<char>();
+        Test<short>();
+        Test<int>();
+        Test<long long>();
+        if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+            Test<double>();
+
+        Test<float>();
+
+        // vector types
+        Test<char2>();
+        Test<short2>();
+        Test<int2>();
+        Test<longlong2>();
+
+        Test<char4>();
+        Test<short4>();
+        Test<int4>();
+        Test<longlong4>();
+
+        // Complex types
+        Test<TestFoo>();
+        Test<TestBar>();
+    }
+
+#endif
+
+    return 0;
+}
+
+
diff --git a/external/cub/test/test_block_scan.cu b/external/cub/test/test_block_scan.cu
new file mode 100644
index 00000000000..033c89ee094
--- /dev/null
+++ b/external/cub/test/test_block_scan.cu
@@ -0,0 +1,929 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of BlockScan utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <iostream>
+#include <limits>
+#include <typeinfo>
+
+#include <cub/block/block_scan.cuh>
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_store.cuh>
+#include <cub/util_ptx.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose       = false;    // when true, the test dumps its input data before launching
+int                     g_repeat        = 0;        // presumably the --repeat count set in main (not visible here) -- confirm
+CachingDeviceAllocator  g_allocator(true);          // allocator used for the device buffers of this test
+
+
+/**
+ * Primitive variant to test (selects which BlockScan overload DeviceTest calls)
+ */
+enum TestMode
+{
+    BASIC,          // scan only: no aggregate output, no prefix callback
+    AGGREGATE,      // scan that also outputs the block-wide aggregate
+    PREFIX,         // scan that is handed a prefix callback functor
+};
+
+
+/**
+ * Scan mode to test (selects the exclusive or inclusive DeviceTest overloads)
+ */
+enum ScanMode
+{
+    EXCLUSIVE,      // ExclusiveScan / ExclusiveSum
+    INCLUSIVE       // InclusiveScan / InclusiveSum
+};
+
+
+/**
+ * \brief Forwarding wrapper around a binary operator; its distinct type precludes test-specialized dispatch to the *Sum variants
+ */
+template<typename OpT>
+struct WrapperFunctor
+{
+    OpT op;
+
+    WrapperFunctor(OpT wrapped_op) : op(wrapped_op) {}
+
+    template <typename ValueT>
+    __host__ __device__ __forceinline__ ValueT operator()(const ValueT &lhs, const ValueT &rhs) const
+    {
+        return op(lhs, rhs);
+    }
+};
+
+
+/**
+ * Stateful prefix functor: hands out the current prefix and folds each block aggregate into it
+ */
+template <
+    typename T,
+    typename ScanOpT>
+struct BlockPrefixCallbackOp
+{
+    int     linear_tid;     // thread id supplied at construction
+    T       prefix;         // running prefix, updated on every call
+    ScanOpT  scan_op;       // operator used to fold the aggregate into the prefix
+
+    __device__ __forceinline__
+    BlockPrefixCallbackOp(int tid, T seed, ScanOpT op) :
+        linear_tid(tid),
+        prefix(seed),
+        scan_op(op)
+    {}
+
+    __device__ __forceinline__
+    T operator()(T block_aggregate)
+    {
+        // For testing purposes: only thread 0 returns the pre-update prefix, every other thread returns T()
+        T seed_for_tile = (linear_tid == 0) ? prefix : T();
+        prefix = scan_op(prefix, block_aggregate);
+        return seed_for_tile;
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Exclusive scan
+//---------------------------------------------------------------------
+
+/// Exclusive scan (BASIC, 1): in-place single-item ExclusiveScan seeded with initial_value; block_aggregate and prefix_op are unused
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data[0], data[0], initial_value, scan_op);
+}
+
+/// Exclusive scan (BASIC, ITEMS_PER_THREAD): in-place array ExclusiveScan seeded with initial_value; block_aggregate and prefix_op are unused
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data, data, initial_value, scan_op);
+}
+
+/// Exclusive scan (AGGREGATE, 1): in-place single-item ExclusiveScan seeded with initial_value that also outputs block_aggregate; prefix_op is unused
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data[0], data[0], initial_value, scan_op, block_aggregate);
+}
+
+/// Exclusive scan (AGGREGATE, ITEMS_PER_THREAD): in-place array ExclusiveScan seeded with initial_value that also outputs block_aggregate; prefix_op is unused
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data, data, initial_value, scan_op, block_aggregate);
+}
+
+/// Exclusive scan (PREFIX, 1): in-place single-item ExclusiveScan that is handed prefix_op instead of initial_value; block_aggregate is never written
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data[0], data[0], scan_op, prefix_op);
+}
+
+/// Exclusive scan (PREFIX, ITEMS_PER_THREAD): in-place array ExclusiveScan that is handed prefix_op instead of initial_value; block_aggregate is never written
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.ExclusiveScan(data, data, scan_op, prefix_op);
+}
+
+
+//---------------------------------------------------------------------
+// Exclusive sum
+//---------------------------------------------------------------------
+
+/// Exclusive sum (BASIC, 1): overload for a Sum operator with is_primitive == true; in-place single-item ExclusiveSum (initial_value and scan_op are not forwarded)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data[0], data[0]);
+}
+
+/// Exclusive sum (BASIC, ITEMS_PER_THREAD): overload for a Sum operator with is_primitive == true; in-place array ExclusiveSum (initial_value and scan_op are not forwarded)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data, data);
+}
+
+/// Exclusive sum (AGGREGATE, 1): overload for a Sum operator with is_primitive == true; in-place single-item ExclusiveSum that also outputs block_aggregate
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data[0], data[0], block_aggregate);
+}
+
+/// Exclusive sum (AGGREGATE, ITEMS_PER_THREAD): overload for a Sum operator with is_primitive == true; in-place array ExclusiveSum that also outputs block_aggregate
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data, data, block_aggregate);
+}
+
+/// Exclusive sum (PREFIX, 1): overload for a Sum operator with is_primitive == true; in-place single-item ExclusiveSum that is handed prefix_op; block_aggregate is never written
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data[0], data[0], prefix_op);
+}
+
+/// Exclusive sum (PREFIX, ITEMS_PER_THREAD): overload for a Sum operator with is_primitive == true; in-place array ExclusiveSum that is handed prefix_op; block_aggregate is never written
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.ExclusiveSum(data, data, prefix_op);
+}
+
+
+//---------------------------------------------------------------------
+// Inclusive scan
+//---------------------------------------------------------------------
+
+/// Inclusive scan (BASIC, 1): in-place single-item InclusiveScan; initial_value, block_aggregate and prefix_op are not forwarded
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data[0], data[0], scan_op);
+}
+
+/// Inclusive scan (BASIC, ITEMS_PER_THREAD): in-place array InclusiveScan; initial_value, block_aggregate and prefix_op are not forwarded
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data, data, scan_op);
+}
+
+/// Inclusive scan (AGGREGATE, 1): in-place single-item InclusiveScan that also outputs block_aggregate; initial_value and prefix_op are not forwarded
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data[0], data[0], scan_op, block_aggregate);
+}
+
+/// Inclusive scan (AGGREGATE, ITEMS_PER_THREAD): in-place array InclusiveScan that also outputs block_aggregate; initial_value and prefix_op are not forwarded
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data, data, scan_op, block_aggregate);
+}
+
+/// Inclusive scan (PREFIX, 1): in-place single-item InclusiveScan that is handed prefix_op; initial_value is not forwarded and block_aggregate is never written
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data[0], data[0], scan_op, prefix_op);
+}
+
+/// Inclusive scan (PREFIX, ITEMS_PER_THREAD): in-place array InclusiveScan that is handed prefix_op; initial_value is not forwarded and block_aggregate is never written
+template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
+{
+    block_scan.InclusiveScan(data, data, scan_op, prefix_op);
+}
+
+
+//---------------------------------------------------------------------
+// Inclusive sum
+//---------------------------------------------------------------------
+
+/// Inclusive sum (BASIC, 1): overload for a Sum operator with is_primitive == true; in-place single-item InclusiveSum (initial_value and scan_op are not forwarded)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data[0], data[0]);
+}
+
+/// Inclusive sum (BASIC, ITEMS_PER_THREAD): overload for a Sum operator with is_primitive == true; in-place array InclusiveSum (initial_value and scan_op are not forwarded)
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data, data);
+}
+
+/// Inclusive sum (AGGREGATE, 1): overload for a Sum operator with is_primitive == true; in-place single-item InclusiveSum that also outputs block_aggregate
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data[0], data[0], block_aggregate);
+}
+
+/// Inclusive sum (AGGREGATE, ITEMS_PER_THREAD): overload for a Sum operator with is_primitive == true; in-place array InclusiveSum that also outputs block_aggregate
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data, data, block_aggregate);
+}
+
+/// Inclusive sum (PREFIX, 1): overload for a Sum operator with is_primitive == true; in-place single-item InclusiveSum that is handed prefix_op; block_aggregate is never written
+template <typename BlockScanT, typename T, typename PrefixCallbackOp>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data[0], data[0], prefix_op);
+}
+
+/// Inclusive sum (PREFIX, ITEMS_PER_THREAD): overload for a Sum operator with is_primitive == true; in-place array InclusiveSum that is handed prefix_op; block_aggregate is never written
+template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
+__device__ __forceinline__ void DeviceTest(
+    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
+    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
+{
+    block_scan.InclusiveSum(data, data, prefix_op);
+}
+
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * BlockScan test kernel: one thread block scans one tile in place and records the clocks spent inside the scan call.
+ */
+template <
+    int                 BLOCK_DIM_X,
+    int                 BLOCK_DIM_Y,
+    int                 BLOCK_DIM_Z,
+    int                 ITEMS_PER_THREAD,
+    ScanMode            SCAN_MODE,
+    TestMode            TEST_MODE,
+    BlockScanAlgorithm  ALGORITHM,
+    typename            T,
+    typename            ScanOpT>
+__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
+__global__ void BlockScanKernel(
+    T                   *d_in,              // input tile
+    T                   *d_out,             // scanned tile; slot [TILE_SIZE] also receives the updated prefix in PREFIX mode
+    T                   *d_aggregate,       // one slot per thread, written when TEST_MODE != BASIC
+    ScanOpT              scan_op,           // binary scan operator
+    T                   initial_value,      // seed for exclusive scans and for the prefix functor
+    clock_t             *d_elapsed)         // elapsed clocks, written by thread 0 only
+{
+    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Parameterize BlockScan type for our thread block
+    typedef BlockScan<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockScanT;
+
+    // Allocate temp storage in shared memory
+    __shared__ typename BlockScanT::TempStorage temp_storage;
+
+    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
+
+    // Per-thread tile data
+    T data[ITEMS_PER_THREAD];
+    LoadDirectBlocked(linear_tid, d_in, data);
+
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t start = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Test scan
+    T                                   block_aggregate = T();  // value-initialized: PREFIX-mode overloads never write it, yet it is stored below
+    BlockScanT                          block_scan(temp_storage);
+    BlockPrefixCallbackOp<T, ScanOpT>   prefix_op(linear_tid, initial_value, scan_op);
+
+    DeviceTest(block_scan, data, initial_value, scan_op, block_aggregate, prefix_op,
+        Int2Type<SCAN_MODE>(), Int2Type<TEST_MODE>(), Int2Type<Traits<T>::PRIMITIVE>());
+
+    // Stop cycle timer
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t stop = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Store output
+    StoreDirectBlocked(linear_tid, d_out, data);
+
+    // Store block_aggregate (in PREFIX mode this is the value-initialized default)
+    if (TEST_MODE != BASIC)
+        d_aggregate[linear_tid] = block_aggregate;
+
+    // Store prefix (one element past the tile, so d_out needs at least TILE_SIZE + 1 slots)
+    if (TEST_MODE == PREFIX)
+    {
+        if (linear_tid == 0)
+            d_out[TILE_SIZE] = prefix_op.prefix;
+    }
+
+    // Store time (absolute difference guards against the counter wrapping)
+    if (linear_tid == 0)
+        *d_elapsed = (start > stop) ? start - stop : stop - start;
+}
+
+
+
+//---------------------------------------------------------------------
+// Host utility subroutines
+//---------------------------------------------------------------------
+
+/**
+ * Generate an exclusive-scan input and its host reference; returns the aggregate of all inputs
+ */
+template <typename T, typename ScanOpT>
+T Initialize(
+    GenMode     gen_mode,
+    T           *h_in,
+    T           *h_reference,
+    int         num_items,
+    ScanOpT     scan_op,
+    T           initial_value,
+    Int2Type<EXCLUSIVE>)
+{
+    InitValue(gen_mode, h_in[0], 0);
+
+    // Item 0: its exclusive result is the seed itself; the aggregate excludes the seed
+    T aggregate         = h_in[0];
+    h_reference[0]      = initial_value;
+    T running           = scan_op(initial_value, h_in[0]);
+    for (int idx = 1; idx < num_items; ++idx)
+    {
+        InitValue(gen_mode, h_in[idx], idx);
+        h_reference[idx]    = running;                          // scan of everything before idx
+        running             = scan_op(running, h_in[idx]);
+        aggregate           = scan_op(aggregate, h_in[idx]);
+    }
+
+    return aggregate;
+}
+
+
+/**
+ * Generate an inclusive-scan input and its host reference; returns the aggregate of all inputs
+ */
+template <typename T, typename ScanOpT>
+T Initialize(
+    GenMode     gen_mode,
+    T           *h_in,
+    T           *h_reference,
+    int         num_items,
+    ScanOpT      scan_op,
+    T           initial_value,
+    Int2Type<INCLUSIVE>)
+{
+    InitValue(gen_mode, h_in[0], 0);
+
+    // Item 0: the reference folds the seed into the first input; the aggregate excludes the seed
+    T aggregate         = h_in[0];
+    T running           = scan_op(initial_value, h_in[0]);
+    h_reference[0]      = running;
+    for (int idx = 1; idx < num_items; ++idx)
+    {
+        InitValue(gen_mode, h_in[idx], idx);
+        running             = scan_op(running, h_in[idx]);
+        aggregate           = scan_op(aggregate, h_in[idx]);
+        h_reference[idx]    = running;                          // scan up to and including idx
+    }
+
+    return aggregate;
+}
+
+
+/**
+ * Test thread block scan for one configuration.  (Specialized for sufficient resources: builds a host reference, runs BlockScanKernel on a single thread block, and compares the device results against the reference)
+ */
+template <
+    int                 BLOCK_DIM_X,
+    int                 BLOCK_DIM_Y,
+    int                 BLOCK_DIM_Z,
+    int                 ITEMS_PER_THREAD,
+    ScanMode            SCAN_MODE,
+    TestMode            TEST_MODE,
+    BlockScanAlgorithm  ALGORITHM,
+    typename            ScanOpT,
+    typename            T>
+void Test(
+    GenMode             gen_mode,               ///< [in] How input items are generated (forwarded to Initialize)
+    ScanOpT             scan_op,                ///< [in] Binary scan operator, used on the host for the reference and passed to the kernel
+    T                   initial_value,          ///< [in] Seed value forwarded to Initialize and to the kernel
+    Int2Type<true>      sufficient_resources)   ///< [in] Overload-selection tag; its value is not read
+{
+    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
+    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
+
+    // Allocate host arrays: input tile, reference scan output, and one expected aggregate per thread
+    T *h_in = new T[TILE_SIZE];
+    T *h_reference = new T[TILE_SIZE];
+    T *h_aggregate = new T[BLOCK_THREADS];
+
+    // Initialize problem (fills h_in and h_reference; the return value is the expected block-wide aggregate)
+    T block_aggregate = Initialize(
+        gen_mode,
+        h_in,
+        h_reference,
+        TILE_SIZE,
+        scan_op,
+        initial_value,
+        Int2Type<SCAN_MODE>());
+
+    // Test reference block_aggregate is returned in all threads
+    for (int i = 0; i < BLOCK_THREADS; ++i)
+    {
+        h_aggregate[i] = block_aggregate;
+    }
+
+    // Describe the configuration under test before running the kernel
+    printf("Test-mode %d, gen-mode %d, policy %d, %s %s BlockScan, %d (%d,%d,%d) thread block threads, %d items per thread, %d tile size, %s (%d bytes) elements:\n",
+        TEST_MODE, gen_mode, ALGORITHM,
+        (SCAN_MODE == INCLUSIVE) ? "Inclusive" : "Exclusive", typeid(ScanOpT).name(),
+        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
+        ITEMS_PER_THREAD,  TILE_SIZE,
+        typeid(T).name(), (int) sizeof(T));
+    fflush(stdout);
+
+    // Initialize/clear device arrays (d_out is larger than the tile: element TILE_SIZE is read back as the running total in PREFIX mode)
+    T       *d_in = NULL;
+    T       *d_out = NULL;
+    T       *d_aggregate = NULL;
+    clock_t *d_elapsed = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long)));   // NOTE(review): pointer is clock_t* but sized as unsigned long long -- confirm the element type the kernel writes
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * (TILE_SIZE + 2)));      // allocates TILE_SIZE + 2 items, although only TILE_SIZE + 1 are cleared below
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_aggregate, sizeof(T) * BLOCK_THREADS));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * TILE_SIZE, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * (TILE_SIZE + 1)));
+    CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * BLOCK_THREADS));
+
+    // Display input problem data
+    if (g_verbose)
+    {
+        printf("Input data: ");
+        for (int i = 0; i < TILE_SIZE; i++)
+        {
+            std::cout << CoutCast(h_in[i]) << ", ";
+        }
+        printf("\n\n");
+    }
+
+    // Run block_aggregate/prefix kernel on a grid of one block shaped (BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)
+    dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
+    BlockScanKernel<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM><<<1, block_dims>>>(
+        d_in,
+        d_out,
+        d_aggregate,
+        scan_op,
+        initial_value,
+        d_elapsed);
+
+    CubDebugExit(cudaPeekAtLastError());        // check for an error recorded by the launch
+    CubDebugExit(cudaDeviceSynchronize());      // wait for the kernel to finish before reading results
+
+    // Copy out and display results (a non-zero compare value is reported as FAIL)
+    printf("\tScan results: ");
+    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    if (TEST_MODE == AGGREGATE)
+    {
+        // Copy out and display block_aggregate
+        printf("\tScan block aggregate: ");
+        compare = CompareDeviceResults(h_aggregate, d_aggregate, BLOCK_THREADS, g_verbose, g_verbose);
+        printf("%s\n", compare ? "FAIL" : "PASS");
+        AssertEquals(0, compare);
+    }
+
+    if (TEST_MODE == PREFIX)
+    {
+        // Copy out and display updated prefix (expected value: initial_value combined with the block aggregate, stored at d_out[TILE_SIZE])
+        printf("\tScan running total: ");
+        T running_total = scan_op(initial_value, block_aggregate);
+        compare = CompareDeviceResults(&running_total, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
+        printf("%s\n", compare ? "FAIL" : "PASS");
+        AssertEquals(0, compare);
+    }
+
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    // Cleanup (host arrays first, then device allocations)
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (h_aggregate) delete[] h_aggregate;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_aggregate) CubDebugExit(g_allocator.DeviceFree(d_aggregate));
+    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Test thread block scan.  (Specialized for insufficient resources: the body is empty, so the configuration is skipped without printing anything)
+ */
+template <
+    int                 BLOCK_DIM_X,
+    int                 BLOCK_DIM_Y,
+    int                 BLOCK_DIM_Z,
+    int                 ITEMS_PER_THREAD,
+    ScanMode            SCAN_MODE,
+    TestMode            TEST_MODE,
+    BlockScanAlgorithm  ALGORITHM,
+    typename            ScanOpT,
+    typename            T>
+void Test(
+    GenMode             gen_mode,               ///< [in] Unused
+    ScanOpT             scan_op,                ///< [in] Unused
+    T                   initial_value,          ///< [in] Unused
+    Int2Type<false>     sufficient_resources)   ///< [in] Overload-selection tag; its value is not read
+{}
+
+
+/**
+ * Test thread block scan: decides at compile time whether this configuration fits the resource limits below, then dispatches to the Int2Type<true> or Int2Type<false> overload
+ */
+template <
+    int                 BLOCK_DIM_X,
+    int                 BLOCK_DIM_Y,
+    int                 BLOCK_DIM_Z,
+    int                 ITEMS_PER_THREAD,
+    ScanMode            SCAN_MODE,
+    TestMode            TEST_MODE,
+    BlockScanAlgorithm  ALGORITHM,
+    typename            ScanOpT,
+    typename            T>
+void Test(
+    GenMode             gen_mode,
+    ScanOpT             scan_op,
+    T                   initial_value)
+{
+    // Check size of smem storage for the target arch to make sure it will fit
+    typedef BlockScan<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockScanT;
+
+    enum
+    {
+#if defined(SM100) || defined(SM110) || defined(SM130)
+        sufficient_smem         = (sizeof(typename BlockScanT::TempStorage)     <= 16 * 1024),
+        sufficient_threads      = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)    <= 512),
+#else
+        sufficient_smem         = (sizeof(typename BlockScanT::TempStorage)     <= 16 * 1024),     // same 16KB bound as the branch above; only the thread limit differs
+        sufficient_threads      = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)    <= 1024),
+#endif
+
+#if defined(_WIN32) || defined(_WIN64)
+        // Accommodate ptxas crash bug (access violation) on Windows
+        special_skip            = ((TEST_ARCH <= 130) && (Equals<T, TestBar>::VALUE) && (BLOCK_DIM_Z > 1)),
+#else
+        special_skip            = false,
+#endif
+        sufficient_resources    = (sufficient_smem && sufficient_threads && !special_skip),
+    };
+
+    Test<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM>(
+        gen_mode, scan_op, initial_value, Int2Type<sufficient_resources>());    // tag value picks the real test or the empty overload
+}
+
+
+
+/**
+ * Run test for different thread block dimensions: (BLOCK_THREADS, 1, 1) and (BLOCK_THREADS, 2, 2)
+ */
+template <
+    int                 BLOCK_THREADS,
+    int                 ITEMS_PER_THREAD,
+    ScanMode            SCAN_MODE,
+    TestMode            TEST_MODE,
+    BlockScanAlgorithm  ALGORITHM,
+    typename            ScanOpT,
+    typename            T>
+void Test(
+    GenMode     gen_mode,
+    ScanOpT     scan_op,
+    T           initial_value)
+{
+    Test<BLOCK_THREADS, 1, 1, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM>(gen_mode, scan_op, initial_value);    // one-dimensional block
+    Test<BLOCK_THREADS, 2, 2, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM>(gen_mode, scan_op, initial_value);    // three-dimensional block with 4 * BLOCK_THREADS threads in total
+}
+
+
+/**
+ * Run test for different policy types: one call per BlockScan algorithm enabled through TEST_RAKING, TEST_RAKING_MEMOIZE, or TEST_WARP_SCANS (nothing runs when none of them is defined)
+ */
+template <
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD,
+    ScanMode    SCAN_MODE,
+    TestMode    TEST_MODE,
+    typename    ScanOpT,
+    typename    T>
+void Test(
+    GenMode     gen_mode,
+    ScanOpT     scan_op,
+    T           initial_value)
+{
+#ifdef TEST_RAKING
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, BLOCK_SCAN_RAKING>(gen_mode, scan_op, initial_value);
+#endif
+#ifdef TEST_RAKING_MEMOIZE
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, BLOCK_SCAN_RAKING_MEMOIZE>(gen_mode, scan_op, initial_value);
+#endif
+#ifdef TEST_WARP_SCANS
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, BLOCK_SCAN_WARP_SCANS>(gen_mode, scan_op, initial_value);
+#endif
+}
+
+
+/**
+ * Run tests for different primitive variants: EXCLUSIVE and INCLUSIVE scans, each in BASIC, AGGREGATE, and PREFIX test modes
+ */
+template <
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD,
+    typename    ScanOpT,
+    typename    T>
+void Test(
+    GenMode     gen_mode,
+    ScanOpT     scan_op,
+    T           identity,           ///< [in] Passed as the initial value wherever the variant under test does not accept one
+    T           initial_value)      ///< [in] Initial value for the variants that do accept one
+{
+    // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, BASIC>(gen_mode, scan_op, identity);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, AGGREGATE>(gen_mode, scan_op, identity);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, PREFIX>(gen_mode, scan_op, identity);
+
+    // Exclusive (non-specialized, so we can use initial-value); the operator is wrapped in WrapperFunctor so its type differs from the bare scan_op
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, BASIC>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, AGGREGATE>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, PREFIX>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
+
+    // Inclusive (only the PREFIX variant is given the real initial_value)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, INCLUSIVE, BASIC>(gen_mode, scan_op, identity);      // This scan doesn't take an initial value
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, INCLUSIVE, AGGREGATE>(gen_mode, scan_op, identity);  // This scan doesn't take an initial value
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD, INCLUSIVE, PREFIX>(gen_mode, scan_op, initial_value);
+}
+
+
+/**
+ * Run tests for different problem-generation options: UNIFORM and INTEGER_SEED always, RANDOM only when T is not a floating-point type
+ */
+template <
+    int         BLOCK_THREADS,
+    int         ITEMS_PER_THREAD,
+    typename    ScanOpT,
+    typename    T>
+void Test(
+    ScanOpT     scan_op,
+    T           identity,
+    T           initial_value)
+{
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(UNIFORM, scan_op, identity, initial_value);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(INTEGER_SEED, scan_op, identity, initial_value);
+
+    // Don't test randomly-generated floats b/c of stability
+    if (Traits<T>::CATEGORY != FLOATING_POINT)
+        Test<BLOCK_THREADS, ITEMS_PER_THREAD>(RANDOM, scan_op, identity, initial_value);
+}
+
+
+/**
+ * Run tests for different data types and scan ops; each call passes (scan operator, identity, initial value)
+ */
+template <
+    int BLOCK_THREADS,
+    int ITEMS_PER_THREAD>
+void Test()
+{
+    // Get ptx version (used below to decide whether the double test runs)
+    int ptx_version;
+    CubDebugExit(PtxVersion(ptx_version));
+
+    // primitive (Sum over unsigned integer widths and float; identity 0, initial value 99)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned char) 0, (unsigned char) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned short) 0, (unsigned short) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned int) 0, (unsigned int) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned long long) 0, (unsigned long long) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (float) 0, (float) 99);
+
+    // primitive (alternative scan op: Max over signed types, with the type's minimum as identity)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<char>::min(), (char) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<short>::min(), (short) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<int>::min(), (int) 99);
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<long long>::min(), (long long) 99);
+
+    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+        Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<double>::max() * -1, (double) 99);
+
+    // vec-1 (single-component vector type)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_uchar1(0), make_uchar1(17));
+
+    // vec-2 (two-component vector types)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_uchar2(0, 0), make_uchar2(17, 21));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_ushort2(0, 0), make_ushort2(17, 21));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_uint2(0, 0), make_uint2(17, 21));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_ulonglong2(0, 0), make_ulonglong2(17, 21));
+
+    // vec-4 (four-component vector types)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_char4(0, 0, 0, 0), make_char4(17, 21, 32, 85));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_short4(0, 0, 0, 0), make_short4(17, 21, 32, 85));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_int4(0, 0, 0, 0), make_int4(17, 21, 32, 85));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_longlong4(0, 0, 0, 0), make_longlong4(17, 21, 32, 85));
+
+    // complex (the TestFoo and TestBar test types)
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), TestFoo::MakeTestFoo(0, 0, 0, 0), TestFoo::MakeTestFoo(17, 21, 32, 85));
+    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), TestBar(0, 0), TestBar(17, 21));
+
+}
+
+
+/**
+ * Run tests for different items per thread (1, 2, and 9)
+ */
+template <int BLOCK_THREADS>
+void Test()
+{
+    Test<BLOCK_THREADS, 1>();
+    Test<BLOCK_THREADS, 2>();
+    Test<BLOCK_THREADS, 9>();
+}
+
+
+
+/**
+ * Main: parses the command line, initializes the device, then runs either the QUICK_TEST subset or the thorough suite (g_repeat + 1 passes)
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage (every option is followed by a space so adjacent options do not run together)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#ifdef QUICK_TEST
+
+    // Compile/run quick tests
+    Test<128, 1, 1, 1, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), int(0));
+
+    Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), int(0));
+    Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_RAKING>(UNIFORM, Sum(), int(0));
+    Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_RAKING_MEMOIZE>(UNIFORM, Sum(), int(0));
+
+    Test<128, 1, 1, 2, INCLUSIVE, PREFIX, BLOCK_SCAN_RAKING>(INTEGER_SEED, Sum(), TestFoo::MakeTestFoo(17, 21, 32, 85));
+    Test<128, 1, 1, 1, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), make_longlong4(17, 21, 32, 85));
+
+
+#else
+
+    // Compile/run thorough tests (the suite always runs once, plus g_repeat additional passes)
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Run tests for different thread block sizes
+        Test<17>();
+        Test<32>();
+        Test<62>();
+        Test<65>();
+//            Test<96>();             // TODO: file bug for UNREACHABLE error for Test<96, 9, BASIC, BLOCK_SCAN_RAKING>(UNIFORM, Sum(), NullType(), make_ulonglong2(17, 21));
+        Test<128>();
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
+
diff --git a/external/cub/test/test_device_histogram.cu b/external/cub/test/test_device_histogram.cu
new file mode 100644
index 00000000000..b77b7391041
--- /dev/null
+++ b/external/cub/test/test_device_histogram.cu
@@ -0,0 +1,1669 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceHistogram utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <limits>
+#include <algorithm>
+#include <typeinfo>
+
+#if defined(QUICK_TEST) || defined(QUICKER_TEST)
+    #include <npp.h>
+#endif
+
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/device/device_histogram.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+
+// Dispatch types; enumerators are wrapped in Int2Type<> (e.g. Int2Type<CUB>, Int2Type<NPP>) to select a Dispatch* overload
+enum Backend
+{
+    CUB,        // CUB method
+    NPP,        // NPP method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+bool                    g_verbose_input     = false;    // NOTE(review): presumably toggles printing of the input samples -- confirm against its uses later in the file
+bool                    g_verbose           = false;    // NOTE(review): presumably toggles verbose result output -- confirm
+int                     g_timing_iterations = 0;        // NOTE(review): presumably the repetition count handed to the Dispatch* helpers -- confirm
+int                     g_repeat            = 0;        // NOTE(review): presumably the number of extra passes over the test suite -- confirm
+CachingDeviceAllocator  g_allocator(true);              // Shared device-memory allocator (the meaning of the constructor argument is not visible here)
+
+
+
+
+//---------------------------------------------------------------------
+// Dispatch to NPP histogram
+//---------------------------------------------------------------------
+
+#if defined(QUICK_TEST) || defined(QUICKER_TEST)
+
+/**
+ * Dispatch to single-channel 8b NPP histo-even.  Follows the CUB two-phase convention: when d_temp_storage is NULL only the required scratch size is written to temp_storage_bytes; otherwise the histogram is computed timing_timing_iterations times.  Returns cudaErrorUnknown if NPP reports an error, cudaSuccess otherwise.
+ */
+template <typename CounterT, typename LevelT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t DispatchEven(
+    Int2Type<1>             num_channels,
+    Int2Type<1>             num_active_channels,
+    Int2Type<NPP>           dispatch_to,
+    int                     timing_timing_iterations,     ///< [in] Number of times the histogram computation is repeated
+    size_t                  *d_temp_storage_bytes,        ///< Unused by the NPP backend
+    cudaError_t             *d_cdp_error,                 ///< Unused by the NPP backend
+
+    void*               d_temp_storage,           ///< [in] Scratch buffer handed to NPP, or NULL to query the required size
+    size_t&             temp_storage_bytes,       ///< [out] Required scratch size in bytes (written only during the size query)
+    unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+    CounterT            *d_histogram[1],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+    int                 num_levels[1],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT              lower_level[1],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT              upper_level[1],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT             num_row_pixels,           ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT             num_rows,                 ///< [in] The number of rows in the region of interest
+    OffsetT             row_stride_bytes,         ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    cudaStream_t        stream,                   ///< Unused by the NPP backend
+    bool                debug_synchronous)        ///< Unused by the NPP backend
+{
+    // Negative NppStatus values are errors (positive ones are warnings); errors are reported as cudaErrorUnknown
+    cudaError_t error = cudaSuccess;
+    NppStatus status = NPP_SUCCESS;
+    NppiSize oSizeROI = {
+        num_row_pixels,
+        num_rows
+    };
+
+    if (d_temp_storage == NULL)
+    {
+        int nDeviceBufferSize = 0;
+        status = nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, num_levels[0] ,&nDeviceBufferSize);
+        temp_storage_bytes = nDeviceBufferSize;
+    }
+    else
+    {
+        for (int i = 0; (i < timing_timing_iterations) && (status >= NPP_SUCCESS); ++i)
+        {
+            // compute the histogram (stop repeating after the first NPP error)
+            status = nppiHistogramEven_8u_C1R(
+                d_samples,
+                row_stride_bytes,
+                oSizeROI,
+                d_histogram[0],
+                num_levels[0],
+                lower_level[0],
+                upper_level[0],
+                (Npp8u*) d_temp_storage);
+        }
+    }
+    if (status < NPP_SUCCESS) error = cudaErrorUnknown;
+    return error;
+}
+
+
+/**
+ * Dispatch to 3/4 8b NPP histo-even (4 interleaved channels, 3 active).  Follows the CUB two-phase convention: when d_temp_storage is NULL only the required scratch size is written to temp_storage_bytes; otherwise the histogram is computed timing_timing_iterations times.  Returns cudaErrorUnknown if NPP reports an error, cudaSuccess otherwise.
+ */
+template <typename CounterT, typename LevelT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t DispatchEven(
+    Int2Type<4>          num_channels,
+    Int2Type<3>   num_active_channels,
+    Int2Type<NPP>           dispatch_to,
+    int                     timing_timing_iterations,     ///< [in] Number of times the histogram computation is repeated
+    size_t                  *d_temp_storage_bytes,        ///< Unused by the NPP backend
+    cudaError_t             *d_cdp_error,                 ///< Unused by the NPP backend
+
+    void*               d_temp_storage,           ///< [in] Scratch buffer handed to NPP, or NULL to query the required size
+    size_t&             temp_storage_bytes,       ///< [out] Required scratch size in bytes (written only during the size query)
+    unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+    CounterT            *d_histogram[3],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+    int                 num_levels[3],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT              lower_level[3],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT              upper_level[3],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT             num_row_pixels,           ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT             num_rows,                 ///< [in] The number of rows in the region of interest
+    OffsetT             row_stride_bytes,         ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    cudaStream_t        stream,                   ///< Unused by the NPP backend
+    bool                debug_synchronous)        ///< Unused by the NPP backend
+{
+    // Negative NppStatus values are errors (positive ones are warnings); errors are reported as cudaErrorUnknown
+    cudaError_t error = cudaSuccess;
+    NppStatus status = NPP_SUCCESS;
+    NppiSize oSizeROI = {
+        num_row_pixels,
+        num_rows
+    };
+
+    if (d_temp_storage == NULL)
+    {
+        int nDeviceBufferSize = 0;
+        status = nppiHistogramEvenGetBufferSize_8u_AC4R(oSizeROI, num_levels ,&nDeviceBufferSize);
+        temp_storage_bytes = nDeviceBufferSize;
+    }
+    else
+    {
+        for (int i = 0; (i < timing_timing_iterations) && (status >= NPP_SUCCESS); ++i)
+        {
+            // compute the histogram (stop repeating after the first NPP error)
+            status = nppiHistogramEven_8u_AC4R(
+                d_samples,
+                row_stride_bytes,
+                oSizeROI,
+                d_histogram,
+                num_levels,
+                lower_level,
+                upper_level,
+                (Npp8u*) d_temp_storage);
+        }
+    }
+    if (status < NPP_SUCCESS) error = cudaErrorUnknown;
+    return error;
+}
+
+
+#endif // #if defined(QUICK_TEST) || defined(QUICKER_TEST)
+
+
+//---------------------------------------------------------------------
+// Dispatch to different DeviceHistogram entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to CUB single histogram-even entrypoint, repeated timing_timing_iterations times; returns the status of the last repetition
+ */
+template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t DispatchEven(
+    Int2Type<1>             num_channels,
+    Int2Type<1>             num_active_channels,
+    Int2Type<CUB>           dispatch_to,
+    int                     timing_timing_iterations,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+    CounterT            *d_histogram[1],                            ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+    int                 num_levels[1],                              ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT              lower_level[1],                             ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT              upper_level[1],                             ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    // Only the status of the final repetition is reported; zero repetitions yield cudaSuccess
+    cudaError_t last_status = cudaSuccess;
+
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        // Single-channel entrypoint: pass element 0 of each per-channel array
+        last_status = DeviceHistogram::HistogramEven(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram[0], num_levels[0],
+            lower_level[0], upper_level[0],
+            num_row_pixels, num_rows, row_stride_bytes,
+            stream,
+            debug_synchronous);
+    }
+
+    return last_status;
+}
+
+/**
+ * Dispatch to CUB multi histogram-even entrypoint, repeated timing_timing_iterations times; returns the status of the last repetition
+ */
+template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t DispatchEven(
+    Int2Type<NUM_CHANNELS>          num_channels,
+    Int2Type<NUM_ACTIVE_CHANNELS>   num_active_channels,
+    Int2Type<CUB>           dispatch_to,
+    int                     timing_timing_iterations,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+    CounterT            *d_histogram[NUM_ACTIVE_CHANNELS],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+    int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    // Only the status of the final repetition is reported; zero repetitions yield cudaSuccess
+    cudaError_t last_status = cudaSuccess;
+
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        // Multi-channel entrypoint: the per-channel arrays are forwarded whole
+        last_status = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram, num_levels,
+            lower_level, upper_level,
+            num_row_pixels, num_rows, row_stride_bytes,
+            stream,
+            debug_synchronous);
+    }
+
+    return last_status;
+}
+
+
+/**
+ * Dispatch to CUB single histogram-range entrypoint, repeated timing_timing_iterations times; returns the status of the last repetition
+ */
+template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t DispatchRange(
+    Int2Type<1>             num_channels,
+    Int2Type<1>             num_active_channels,
+    Int2Type<CUB>           dispatch_to,
+    int                     timing_timing_iterations,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+    CounterT            *d_histogram[1],                            ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+    int                 num_levels[1],                              ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT              *d_levels[1],                               ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    // Only the status of the final repetition is reported; zero repetitions yield cudaSuccess
+    cudaError_t last_status = cudaSuccess;
+
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        // Single-channel entrypoint: pass element 0 of each per-channel array
+        last_status = DeviceHistogram::HistogramRange(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram[0], num_levels[0], d_levels[0],
+            num_row_pixels, num_rows, row_stride_bytes,
+            stream,
+            debug_synchronous);
+    }
+
+    return last_status;
+}
+
+
+/**
+ * CUB backend: invoke DeviceHistogram::MultiHistogramRange once per timing
+ * iteration; yields the last call's status (cudaSuccess if none were made).
+ */
+template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t DispatchRange(
+    Int2Type<NUM_CHANNELS>          num_channels,
+    Int2Type<NUM_ACTIVE_CHANNELS>   num_active_channels,
+    Int2Type<CUB>           dispatch_to,
+    int                     timing_timing_iterations,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+    CounterT            *d_histogram[NUM_ACTIVE_CHANNELS],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+    int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_samples,
+            d_histogram,
+            num_levels,
+            d_levels,
+            num_row_pixels,
+            num_rows,
+            row_stride_bytes,
+            stream,
+            debug_synchronous);
+    }
+    return status;
+}
+
+
+
+//---------------------------------------------------------------------
+// CUDA nested-parallelism test kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceHistogram
+ * /
+template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
+__global__ void CnpDispatchKernel(
+    Int2Type<ALGORITHM> algorithm,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t              temp_storage_bytes,
+    SampleT             *d_samples,
+    SampleIteratorT      d_sample_itr,
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_out_histograms,
+    int                 num_samples,
+    bool                debug_synchronous)
+{
+#ifndef CUB_CDP
+    *d_cdp_error = cudaErrorNotSupported;
+#else
+    *d_cdp_error = Dispatch<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(algorithm, Int2Type<false>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_samples, 0, debug_synchronous);
+    *d_temp_storage_bytes = temp_storage_bytes;
+#endif
+}
+
+
+/ **
+ * Dispatch to CDP kernel
+ * /
+template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
+cudaError_t Dispatch(
+    Int2Type<ALGORITHM> algorithm,
+    Int2Type<true>      use_cdp,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    SampleT             *d_samples,
+    SampleIteratorT      d_sample_itr,
+    CounterT        *d_histograms[NUM_ACTIVE_CHANNELS],
+    int                 num_samples,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
+    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_histo_wrapper;
+    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
+
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, ALGORITHM><<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo_wrapper, num_samples, debug_synchronous);
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+*/
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+// Searches for bin given a list of bin-boundary levels (assumed sorted ascending, as std::upper_bound requires)
+template <typename LevelT>
+struct SearchTransform
+{
+    LevelT          *levels;      // Pointer to levels array
+    int             num_levels;   // Number of levels in array
+
+    // Functor for converting samples to bin-ids.  Bin i covers [levels[i], levels[i + 1]).
+    // num_levels is returned if sample is out of range on either side.
+    template <typename SampleT>
+    int operator()(SampleT sample)
+    {
+        // upper_bound yields the first level greater than the sample; its bin is the one before it
+        int bin = int(std::upper_bound(levels, levels + num_levels, (LevelT) sample) - levels - 1);
+        // Below the first level (bin == -1) or at/above the last level (bin == num_levels - 1)
+        if ((bin < 0) || (bin >= num_levels - 1))
+            return num_levels;
+        return bin;
+    }
+};
+
+
+// Maps samples onto evenly-spaced bins by dividing their offset from min by scale
+template <typename LevelT>
+struct ScaleTransform
+{
+    int    num_levels;  // Level count; doubles as the out-of-range return value
+    LevelT max;         // Max sample level (exclusive)
+    LevelT min;         // Min sample level (inclusive)
+    LevelT scale;       // Bin scaling factor (divisor applied to sample - min)
+
+    void Init(
+        int    num_levels,  // Number of levels in array
+        LevelT max,         // Max sample level (exclusive)
+        LevelT min,         // Min sample level (inclusive)
+        LevelT scale)       // Bin scaling factor
+    {
+        this->scale      = scale;
+        this->min        = min;
+        this->max        = max;
+        this->num_levels = num_levels;
+    }
+
+    // Sample-to-bin functor: yields num_levels when sample < min or sample >= max,
+    // otherwise (sample - min) / scale converted to int
+    template <typename SampleT>
+    int operator()(SampleT sample)
+    {
+        const bool inside = !(sample < min) && !(sample >= max);
+        if (inside)
+        {
+            return (int) ((((LevelT) sample) - min) / scale);
+        }
+        return num_levels;
+    }
+};
+
+// float specialization: bins via multiplication by a precomputed reciprocal
+template <>
+struct ScaleTransform<float>
+{
+    int   num_levels;  // Level count; doubles as the out-of-range return value
+    float max;         // Max sample level (exclusive)
+    float min;         // Min sample level (inclusive)
+    float scale;       // Reciprocal of the scale handed to Init()
+
+    void Init(
+        int    num_levels,  // Number of levels in array
+        float max,         // Max sample level (exclusive)
+        float min,         // Min sample level (inclusive)
+        float scale)       // Bin scaling factor (stored inverted)
+    {
+        this->scale      = 1.0f / scale;
+        this->min        = min;
+        this->max        = max;
+        this->num_levels = num_levels;
+    }
+
+    // Sample-to-bin functor: yields num_levels when sample < min or sample >= max,
+    // otherwise (sample - min) * scale converted to int
+    template <typename SampleT>
+    int operator()(SampleT sample)
+    {
+        const bool inside = !(sample < min) && !(sample >= max);
+        if (inside)
+        {
+            return (int) ((((float) sample) - min) * scale);
+        }
+        return num_levels;
+    }
+};
+
+
+/**
+ * Draw random bits, scale them by max_level, and store the result in datum
+ */
+template <typename T, typename LevelT>
+void Sample(T &datum, LevelT max_level, int entropy_reduction)
+{
+    unsigned int random_word;
+    RandomBits(random_word, entropy_reduction);
+    // Normalize against the largest unsigned int value, then stretch to max_level
+    const unsigned int all_ones = (unsigned int) -1;
+    const float        ratio    = float(random_word) / all_ones;
+    datum = (T) (ratio * max_level);
+}
+
+
+/**
+ * Fill the active channels of every pixel in h_samples with generated sample values
+ */
+template <
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        LevelT,
+    typename        SampleT,
+    typename        OffsetT>
+void InitializeSamples(
+    LevelT          max_level,
+    int             entropy_reduction,
+    SampleT         *h_samples,
+    OffsetT         num_row_pixels,         ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,               ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    for (OffsetT row = 0; row < num_rows; ++row)
+    {
+        for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
+        {
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                // Rows start row_stride_bytes apart; a pixel's channels are stored contiguously
+                OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
+                SampleT &slot = h_samples[offset];
+
+                Sample(slot, max_level, entropy_reduction);
+
+                // Echo the generated value only in verbose-input mode
+                if (!g_verbose_input)
+                    continue;
+                if (channel > 0) printf(", ");
+                std::cout << CoutCast(slot);
+            }
+        }
+    }
+}
+
+
+/**
+ * Compute the reference (host) histograms: zero the num_levels[c] - 1 bins of each
+ * active channel, then bin every active-channel sample in the region of interest via
+ * transform_op[c].  Bin-ids outside [0, num_levels[c] - 1) are not counted.
+ */
+template <
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        SampleIteratorT,
+    typename        TransformOp,
+    typename        OffsetT>
+void InitializeBins(
+    SampleIteratorT h_samples,                              ///< [in] The interleaved multi-channel samples to bin (indexed on the host)
+    int             num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    TransformOp     transform_op[NUM_ACTIVE_CHANNELS],      ///< [in] The functors converting a sample to its bin-id, one for each active channel.
+    CounterT        *h_histogram[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>h_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
+    OffsetT         num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                               ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    // Init bins
+    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+    {
+        for (int bin = 0; bin < num_levels[CHANNEL] - 1; ++bin)
+        {
+            h_histogram[CHANNEL][bin] = 0;
+        }
+    }
+
+    // Bin samples (echoing each bin-id when verbose input is enabled)
+    if (g_verbose_input) printf("Samples: \n");
+    for (OffsetT row = 0; row < num_rows; ++row)
+    {
+        for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
+        {
+            if (g_verbose_input) printf("[");
+            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+            {
+                // Sample offset (rows start row_stride_bytes apart; a pixel's channels are contiguous)
+                OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
+                // Update sample bin
+                int bin = transform_op[channel](h_samples[offset]);
+                if (g_verbose_input) { printf(" (%d)", bin); fflush(stdout); }    // flush only when something was printed
+                if ((bin >= 0) && (bin < num_levels[channel] - 1))
+                {
+                    h_histogram[channel][bin]++;    // valid bin
+                }
+            }
+            if (g_verbose_input) printf("]");
+        }
+        if (g_verbose_input) printf("\n\n");
+    }
+}
+
+
+
+/**
+ * Test histogram-even: builds the host reference with ScaleTransform, runs DispatchEven
+ * (size query, canary-guarded correctness pass, timed pass), compares every channel,
+ * prints throughput, and releases all host/device allocations made here.
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT,
+    typename        SampleIteratorT>
+void TestEven(
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+    SampleIteratorT h_samples,                                  ///< [in] Samples read on the host to build the reference histograms
+    SampleIteratorT d_samples)                                  ///< [in] Samples handed to DispatchEven
+{
+    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
+
+    printf("\n----------------------------\n");
+    printf("%s cub::DeviceHistogramEven (%s) %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
+        (IsPointer<SampleIteratorT>::VALUE) ? "pointer" : "iterator",
+        (int) (num_row_pixels * num_rows),
+        (int) num_rows,
+        (int) num_row_pixels,
+        (int) row_stride_bytes,
+        (int) total_samples,
+        (int) sizeof(SampleT),
+        typeid(SampleT).name(),
+        entropy_reduction,
+        typeid(CounterT).name(),
+        NUM_ACTIVE_CHANNELS,
+        NUM_CHANNELS);
+    std::cout << CoutCast(max_level) << "\n";
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+        std::cout << "\n\tChannel " << channel << ": " << num_levels[channel] - 1 << " bins [" << lower_level[channel] << ", " << upper_level[channel] << ")\n";
+    fflush(stdout);
+
+    // Allocate and initialize host reference histograms and their per-channel bin transforms
+    CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
+    ScaleTransform<LevelT>      transform_op[NUM_ACTIVE_CHANNELS];
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        int bins = num_levels[channel] - 1;
+        h_histogram[channel] = new CounterT[bins];
+
+        transform_op[channel].Init(
+            num_levels[channel],
+            upper_level[channel],
+            lower_level[channel],
+            ((upper_level[channel] - lower_level[channel]) / bins));
+    }
+
+    InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Allocate and initialize device data
+    CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1)));
+        CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1)));
+    }
+
+    // Allocate CDP device arrays
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+
+    DispatchEven(
+        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, true);
+
+    // Allocate temporary storage with "canary" zones
+    int     canary_bytes    = 256;
+    char    canary_token    = 8;
+    char*   canary_zone     = new char[canary_bytes];
+
+    memset(canary_zone, canary_token, canary_bytes);
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
+    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
+
+    // Run warmup/correctness iteration
+    DispatchEven(
+        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
+        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, true);
+
+    // Check canary zones
+    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+
+    // Flush any stdout/stderr
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+    fflush(stdout);
+    fflush(stderr);
+
+    // Check for correctness (and display results, if specified)
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
+        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
+        error |= channel_error;
+    }
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    DispatchEven(
+        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, false);
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth = giga_rate * sizeof(SampleT);
+        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
+            avg_millis,
+            giga_rate,
+            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
+            giga_rate / NUM_CHANNELS,
+            giga_bandwidth);
+    }
+
+    printf("\n\n");
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        if (h_histogram[channel])
+            delete[] h_histogram[channel];
+
+        if (d_histogram[channel])
+            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
+    }
+
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    delete[] canary_zone;       // host-side canary reference pattern allocated above
+
+    // Correctness asserts
+    AssertEquals(0, error);
+}
+
+
+/**
+ * Test histogram-even on freshly generated samples copied into a native device array
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestEvenNative(
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    typedef SampleT Foo;        // alias used by the new-expression below (gcc warning workaround)
+
+    // Host samples: row_stride_bytes / sizeof(SampleT) elements for each of the num_rows rows
+    const OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
+    const size_t  sample_bytes  = sizeof(SampleT) * total_samples;
+    SampleT*      h_samples     = new Foo[total_samples];
+    InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Device copy of the host samples
+    SampleT* d_samples = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sample_bytes));
+    CubDebugExit(cudaMemcpy(d_samples, h_samples, sample_bytes, cudaMemcpyHostToDevice));
+
+    // Run the shared test body on the host/device pointer pair
+    TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        h_samples, d_samples);
+
+    // Release both buffers (delete[] accepts a null pointer)
+    delete[] h_samples;
+    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
+}
+
+
+/**
+ * Test histogram-even (iterator input): samples come from a ConstantInputIterator
+ * constructed with lower_level[0] cast to SampleT
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestEvenIterator(
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    // The same iterator object is passed as both the host and the device sample source
+    SampleT constant_value = (SampleT) lower_level[0];
+    ConstantInputIterator<SampleT> constant_samples(constant_value);
+    TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, lower_level, upper_level,
+        num_row_pixels, num_rows, row_stride_bytes,
+        constant_samples, constant_samples);
+}
+
+
+/**
+ * Test histogram-range
+ */
+template <
+    Backend         BACKEND,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestRange(
+    LevelT          max_level,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
+    LevelT*         levels[NUM_ACTIVE_CHANNELS],                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
+    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
+    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
+    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
+{
+    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
+
+    printf("\n----------------------------\n");
+    printf("%s cub::DeviceHistogramRange %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
+        (int) (num_row_pixels * num_rows),
+        (int) num_rows,
+        (int) num_row_pixels,
+        (int) row_stride_bytes,
+        (int) total_samples,
+        (int) sizeof(SampleT),
+        typeid(SampleT).name(),
+        entropy_reduction,
+        typeid(CounterT).name(),
+        NUM_ACTIVE_CHANNELS,
+        NUM_CHANNELS);
+    std::cout << CoutCast(max_level) << "\n";
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        printf("Channel %d: %d bins [", channel, num_levels[channel] - 1);
+        std::cout << levels[channel][0];
+        for (int level = 1; level < num_levels[channel]; ++level)
+            std::cout << ", " << levels[channel][level];
+        printf("]\n");
+    }
+    fflush(stdout);
+
+    // Allocate and initialize host and device data
+    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
+    SampleT*                    h_samples = new Foo[total_samples];
+    CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
+    SearchTransform<LevelT>     transform_op[NUM_ACTIVE_CHANNELS];
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        transform_op[channel].levels = levels[channel];
+        transform_op[channel].num_levels = num_levels[channel];
+
+        int bins = num_levels[channel] - 1;
+        h_histogram[channel] = new CounterT[bins];
+    }
+
+    InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
+
+    InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
+        h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Allocate and initialize device data
+    SampleT*        d_samples = NULL;
+    LevelT*         d_levels[NUM_ACTIVE_CHANNELS];
+    CounterT*       d_histogram[NUM_ACTIVE_CHANNELS];
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
+    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel]));
+        CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel],         sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice));
+
+        int bins = num_levels[channel] - 1;
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel],  sizeof(CounterT) * bins));
+        CubDebugExit(cudaMemset(d_histogram[channel], 0,                        sizeof(CounterT) * bins));
+    }
+
+    // Allocate CDP device arrays
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+
+    DispatchRange(
+        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, d_levels,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, true);
+
+    // Allocate temporary storage with "canary" zones
+    int     canary_bytes    = 256;
+    char    canary_token    = 9;
+    char*   canary_zone     = new char[canary_bytes];
+
+    memset(canary_zone, canary_token, canary_bytes);
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
+    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
+
+    // Run warmup/correctness iteration
+    DispatchRange(
+        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
+        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, d_levels,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, true);
+
+    // Check canary zones
+    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
+    AssertEquals(0, error);
+
+    // Flush any stdout/stderr
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+    fflush(stdout);
+    fflush(stderr);
+
+    // Check for correctness (and display results, if specified)
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
+        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
+        error |= channel_error;
+    }
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+
+    DispatchRange(
+        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes,
+        d_samples, d_histogram, num_levels, d_levels,
+        num_row_pixels, num_rows, row_stride_bytes,
+        0, false);
+
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth = giga_rate * sizeof(SampleT);
+        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
+            avg_millis,
+            giga_rate,
+            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
+            giga_rate / NUM_CHANNELS,
+            giga_bandwidth);
+    }
+
+    printf("\n\n");
+
+    // Cleanup
+    if (h_samples) delete[] h_samples;
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        if (h_histogram[channel])
+            delete[] h_histogram[channel];
+
+        if (d_histogram[channel])
+            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
+
+        if (d_levels[channel])
+            CubDebugExit(g_allocator.DeviceFree(d_levels[channel]));
+    }
+
+    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, error);
+}
+
+
+/**
+ * Test histogram-even: derives per-channel lower/upper level bounds, then runs the pointer-based (BACKEND) and iterator-based (CUB) even-bin tests
+ */
+template <
+    Backend         BACKEND,
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestEven(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],    // Per-channel level counts (each channel has num_levels - 1 bins)
+    LevelT          max_level,
+    int             max_num_levels)                     // Level count used to derive the smallest level increment
+{
+    LevelT lower_level[NUM_ACTIVE_CHANNELS];
+    LevelT upper_level[NUM_ACTIVE_CHANNELS];
+
+    // Find smallest level increment: max_level split across (max_num_levels - 1) bins
+    int max_bins = max_num_levels - 1;                      // NOTE(review): zero when max_num_levels == 1, making the division below invalid
+    LevelT min_level_increment = max_level / max_bins;
+
+    // Set upper and lower levels for each channel: num_bins increments placed symmetrically about max_level / 2
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        int num_bins = num_levels[channel] - 1;
+        lower_level[channel] = (max_level - (num_bins * min_level_increment)) / 2;
+        upper_level[channel] = (max_level + (num_bins * min_level_increment)) / 2;
+    }
+
+    // Test pointer-based samples (using the requested BACKEND)
+    TestEvenNative<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
+
+    // Test iterator-based samples (CUB-only, regardless of BACKEND)
+    TestEvenIterator<CUB, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
+}
+
+
+
+/**
+ * Test histogram-range: builds evenly spaced per-channel level arrays on the host, runs the array-based TestRange overload, then frees the arrays
+ */
+template <
+    Backend         BACKEND,
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestRange(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],    // Per-channel level counts (each channel has num_levels - 1 bins)
+    LevelT          max_level,
+    int             max_num_levels)                     // Level count used to derive the smallest level increment
+{
+    // Find smallest level increment: max_level split across (max_num_levels - 1) bins
+    int max_bins = max_num_levels - 1;                      // NOTE(review): zero when max_num_levels == 1, making the division below invalid
+    LevelT min_level_increment = max_level / max_bins;
+
+    LevelT* levels[NUM_ACTIVE_CHANNELS];                    // Host arrays of level boundaries, one per channel (released below)
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        levels[channel] = new LevelT[num_levels[channel]];
+
+        int num_bins = num_levels[channel] - 1;
+        LevelT lower_level = (max_level - (num_bins * min_level_increment)) / 2;    // First boundary; same formula as TestEven's lower level
+
+        for (int level = 0; level < num_levels[channel]; ++level)
+            levels[channel][level] = lower_level + (level * min_level_increment);   // Boundaries spaced one increment apart
+    }
+
+    TestRange<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
+        max_level, entropy_reduction, num_levels, levels, num_row_pixels, num_rows, row_stride_bytes);
+
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+        delete[] levels[channel];                           // Matches the new[] in the setup loop above
+
+}
+
+
+
+/**
+ * Test different entrypoints: runs both the histogram-even and histogram-range tests (CUB backend) with identical arguments
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    int             num_levels[NUM_ACTIVE_CHANNELS],
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    TestEven<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(      // Histogram-even entrypoint
+        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
+
+    TestRange<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(     // Histogram-range entrypoint
+        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
+}
+
+
+/**
+ * Test different number of levels (assigns each active channel its own level count, then runs the entrypoint tests)
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    int             entropy_reduction,
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    int num_levels[NUM_ACTIVE_CHANNELS];
+
+    // The uniform configuration (every channel using max_num_levels levels)
+    // is deliberately not exercised here: that variant was disabled as
+    // unnecessary testing, leaving only the varied configuration below.
+    //
+    // Varied configuration: channel 0 uses max_num_levels levels, and each
+    // subsequent channel uses half of the previous channel's count, plus one
+    int channel_levels = max_num_levels;
+    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
+    {
+        num_levels[channel] = channel_levels;
+        channel_levels = (channel_levels / 2) + 1;
+    }
+
+    // Run the entrypoint tests with the per-channel level counts
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
+}
+
+
+
+/**
+ * Test different entropy-levels (runs the level-count tests once per entropy-reduction setting: 0, -1, then 5)
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    OffsetT         row_stride_bytes,
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    // Entropy-reduction settings to exercise, in the order they are run
+    const int entropy_reductions[3] = {0, -1, 5};
+
+    for (int i = 0; i < 3; ++i)
+    {
+        Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+            num_row_pixels, num_rows, row_stride_bytes, entropy_reductions[i], max_level, max_num_levels);
+    }
+}
+
+
+/**
+ * Test different row strides (tightly packed rows, then rows followed by 13 samples of padding)
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    OffsetT         num_row_pixels,
+    OffsetT         num_rows,
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    // Row strides in bytes: exactly one row of samples, and one row plus 13 extra samples
+    OffsetT packed_stride_bytes = num_row_pixels * NUM_CHANNELS * sizeof(SampleT);
+    OffsetT padded_stride_bytes = packed_stride_bytes + (13 * sizeof(SampleT));
+
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, packed_stride_bytes, max_level, max_num_levels);
+
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        num_row_pixels, num_rows, padded_stride_bytes, max_level, max_num_levels);
+}
+
+
+/**
+ * Test different problem sizes (empty images, a 1920x1080 image, a sweep of row/column counts, and four randomly sized single-row images)
+ */
+template <
+    typename        SampleT,
+    int             NUM_CHANNELS,
+    int             NUM_ACTIVE_CHANNELS,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void Test(
+    LevelT          max_level,
+    int             max_num_levels)
+{
+    // Empty images: zero rows (1920 pixels per row), then zero rows and zero columns
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        OffsetT(1920), OffsetT(0), max_level, max_num_levels);
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        OffsetT(0), OffsetT(0), max_level, max_num_levels);
+
+    // 1080 image (1920 pixels per row, 1080 rows)
+    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+        OffsetT(1920), OffsetT(1080), max_level, max_num_levels);
+
+    // Sample different aspect ratios: rows and cols each grow by factors of 1000 while cols stays below 1,000,000 / rows
+    for (OffsetT rows = 1; rows < 1000000; rows *= 1000)
+    {
+        for (OffsetT cols = 1; cols < (1000000 / rows); cols *= 1000)
+        {
+            Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+                cols, rows, max_level, max_num_levels);
+        }
+    }
+
+    // Randomly select linear (single-row) problem sizes between 1 and 10,000,000, four times
+    unsigned int max_int = (unsigned int) -1;               // Largest unsigned int value, used to normalize the random bits
+    for (int i = 0; i < 4; ++i)
+    {
+        unsigned int num_items;
+        RandomBits(num_items);
+        num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));     // Scale into [0, 10,000,000]
+        num_items = CUB_MAX(1, num_items);                                                          // Clamp so the size is at least 1
+
+        Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
+            OffsetT(num_items), 1, max_level, max_num_levels);
+    }
+}
+
+
+
+/**
+ * Test different channel interleavings (valid specialization, selected by an Int2Type<true> tag)
+ */
+template <
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestChannels(
+    LevelT          max_level,
+    int             max_num_levels,
+    Int2Type<true>  is_valid_tag)
+{
+    Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels);     // 1 channel,  1 active
+    Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);     // 4 channels, 3 active
+    Test<SampleT, 3, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);     // 3 channels, 3 active
+    Test<SampleT, 4, 4, CounterT, LevelT, OffsetT>(max_level, max_num_levels);     // 4 channels, 4 active
+}
+
+
+/**
+ * Test different channel interleavings (invalid specialization, selected by an Int2Type<false> tag; runs no tests)
+ */
+template <
+    typename        SampleT,
+    typename        CounterT,
+    typename        LevelT,
+    typename        OffsetT>
+void TestChannels(
+    LevelT          max_level,
+    int             max_num_levels,
+    Int2Type<false> is_valid_tag)
+{}  // Intentionally empty
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+
+
+
+/**
+ * Main: parses command-line options, initializes the device, and runs the histogram tests selected at compile time (QUICKER_TEST, QUICK_TEST, or the thorough default)
+ */
+int main(int argc, char** argv)
+{
+    int num_row_pixels = -1;
+    int entropy_reduction = 0;
+    int num_rows = 1;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_verbose_input = args.CheckCmdLineFlag("v2");
+    args.GetCmdLineArgument("n", num_row_pixels);
+
+    int row_stride_pixels = num_row_pixels;                 // Row stride starts out equal to the row width; --stride may override it
+
+    args.GetCmdLineArgument("rows", num_rows);
+    args.GetCmdLineArgument("stride", row_stride_pixels);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+#if defined(QUICK_TEST) || defined(QUICKER_TEST)
+    bool compare_npp = args.CheckCmdLineFlag("npp");        // Only the quick-test builds consult --npp
+#endif
+
+
+    // Print usage (every option is optional, so each one is shown in its own balanced pair of brackets)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<pixels per row>] "
+            "[--rows=<number of rows>] "
+            "[--stride=<row stride in pixels>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--entropy=<entropy-reduction factor (default 0)>] "
+            "[--v] [--v2] "
+            "[--cdp] "
+            "[--npp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get ptx version
+    int ptx_version;
+    CubDebugExit(PtxVersion(ptx_version));
+
+    if (num_row_pixels < 0)                                 // No non-negative --n value: fall back to 1920 * 1080 pixels per row
+    {
+        num_row_pixels      = 1920 * 1080;
+        row_stride_pixels   = num_row_pixels;
+    }
+
+#if defined(QUICKER_TEST)
+
+    // Compile/run quick tests
+    {
+        // HistogramEven: unsigned char 256 bins
+        typedef unsigned char       SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[1]       = {257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+        if (compare_npp)
+            TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+
+#elif defined(QUICK_TEST)
+
+    // Compile/run quick tests
+    {
+        // HistogramEven: unsigned char 256 bins
+        typedef unsigned char       SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[1]       = {257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+        if (compare_npp)
+            TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: 4/4 multichannel Unsigned char 256 bins
+        typedef unsigned char       SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[4]       = {257, 257, 257, 257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
+
+        TestEven<CUB, SampleT, 4, 4, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: 3/4 multichannel Unsigned char 256 bins
+        typedef unsigned char       SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[3]       = {257, 257, 257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
+
+        TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+        if (compare_npp)
+            TestEven<NPP, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: short [0,1024] 256 bins
+        typedef unsigned short      SampleT;
+        typedef unsigned short      LevelT;
+
+        LevelT  max_level           = 1024;
+        int     num_levels[1]       = {257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: float [0,1.0] 256 bins
+        typedef float               SampleT;
+        typedef float               LevelT;
+
+        LevelT  max_level           = 1.0;
+        int     num_levels[1]       = {257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: 3/4 multichannel float [0,1.0] 256 bins
+        typedef float               SampleT;
+        typedef float               LevelT;
+
+        LevelT  max_level           = 1.0;
+        int     num_levels[3]       = {257, 257, 257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
+
+        TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramRange: signed char 256 bins
+        typedef signed char         SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[1]       = {257};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestRange<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramRange: 3/4 channel, unsigned char, varied bins (256, 128, 64)
+        typedef unsigned char       SampleT;
+        typedef int                 LevelT;
+
+        LevelT  max_level           = 256;
+        int     num_levels[3]       = {257, 129, 65};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
+
+        TestRange<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+    {
+        // HistogramEven: double [0,1.0] 64 bins
+        typedef double              SampleT;
+        typedef double              LevelT;
+
+        LevelT  max_level           = 1.0;
+        int     num_levels[1]       = {65};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+    {
+        // HistogramEven: short [0,1024] 512 bins
+        typedef unsigned short      SampleT;
+        typedef unsigned short      LevelT;
+
+        LevelT  max_level           = 1024;
+        int     num_levels[1]       = {513};
+        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
+
+        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
+    }
+
+#else
+
+    // Compile/run thorough tests (the whole suite runs g_repeat + 1 times)
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        TestChannels <unsigned char,    int, int,   int>(256,   256 + 1, Int2Type<true>());
+        TestChannels <signed char,      int, int,   int>(256,   256 + 1, Int2Type<true>());
+        TestChannels <unsigned short,   int, int,   int>(128,   128 + 1, Int2Type<true>());
+        TestChannels <unsigned short,   int, int,   int>(8192,  8192 + 1, Int2Type<true>());
+        TestChannels <float,            int, float, int>(1.0,   256 + 1, Int2Type<true>());
+
+        // Test down-conversion of size_t offsets to int (only compiled in when size_t and int differ in size)
+        TestChannels <unsigned char,    int, int,   long long>(256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>());
+    }
+
+#endif
+
+    return 0;
+}
+
diff --git a/external/cub/test/test_device_radix_sort.cu b/external/cub/test/test_device_radix_sort.cu
new file mode 100644
index 00000000000..e63ca4e4b45
--- /dev/null
+++ b/external/cub/test/test_device_radix_sort.cu
@@ -0,0 +1,1275 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceRadixSort utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <algorithm>
+#include <typeinfo>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_segmented_radix_sort.cuh>
+
+#include "test_util.h"
+
+#include <thrust/device_ptr.h>
+#include <thrust/sort.h>
+#include <thrust/reverse.h>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false;    // Verbose-output switch (NOTE(review): presumably set from the command line — confirm in main)
+int                     g_timing_iterations = 0;        // Timing-iteration count (NOTE(review): presumably set from the command line — confirm in main)
+int                     g_repeat            = 0;        // Test-suite repetition count (NOTE(review): presumably set from the command line — confirm in main)
+CachingDeviceAllocator  g_allocator(true);              // Shared allocator instance; see cub/util_allocator.cuh for the meaning of the constructor flag
+
+// Dispatch types: enumerators are wrapped in Int2Type<> to select a Dispatch() overload
+enum Backend
+{
+    CUB,                        // CUB method (allows overwriting of input)
+    CUB_NO_OVERWRITE,           // CUB method (disallows overwriting of input)
+
+    CUB_SEGMENTED,              // CUB segmented method (allows overwriting of input)
+    CUB_SEGMENTED_NO_OVERWRITE, // CUB segmented method (disallows overwriting of input)
+
+    THRUST,                     // Thrust method
+    CDP,                        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different DeviceRadixSort entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to CUB sorting entrypoint (specialized for ascending): forwards the key/value DoubleBuffers to DeviceRadixSort::SortPairs and returns its error code
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<false>         is_descending,          // Overload tag: ascending
+    Int2Type<CUB>           dispatch_to,            // Overload tag: CUB backend
+    int                     *d_selector,            // Unused by this overload
+    size_t                  *d_temp_storage_bytes,  // Unused by this overload
+    cudaError_t             *d_cdp_error,           // Unused by this overload
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,           // Unused by this overload
+    const int               *d_segment_offsets,     // Unused by this overload
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    return DeviceRadixSort::SortPairs(
+        d_temp_storage, temp_storage_bytes,
+        d_keys, d_values,
+        num_items, begin_bit, end_bit, stream, debug_synchronous);
+}
+
+/**
+ * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for ascending): calls the pointer-based DeviceRadixSort::SortPairs, then toggles both buffer selectors
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<false>             is_descending,
+    Int2Type<CUB_NO_OVERWRITE>  dispatch_to,
+    int                         *d_selector,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    // Pass the current buffers through const pointers, each followed by its alternate buffer
+    const KeyT      *keys_in    = d_keys.Current();
+    const ValueT    *values_in  = d_values.Current();
+    cudaError_t error = DeviceRadixSort::SortPairs(
+        d_temp_storage, temp_storage_bytes,
+        keys_in, d_keys.Alternate(), values_in, d_values.Alternate(),
+        num_items, begin_bit, end_bit, stream, debug_synchronous);
+    // Toggle both selectors regardless of the returned error code
+    d_keys.selector     = d_keys.selector ^ 1;
+    d_values.selector   = d_values.selector ^ 1;
+    return error;
+}
+
+/**
+ * Dispatch to CUB sorting entrypoint (specialized for descending): forwards the key/value DoubleBuffers to DeviceRadixSort::SortPairsDescending and returns its error code
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<true>          is_descending,          // Overload tag: descending
+    Int2Type<CUB>           dispatch_to,            // Overload tag: CUB backend
+    int                     *d_selector,            // Unused by this overload
+    size_t                  *d_temp_storage_bytes,  // Unused by this overload
+    cudaError_t             *d_cdp_error,           // Unused by this overload
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,           // Unused by this overload
+    const int               *d_segment_offsets,     // Unused by this overload
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    return DeviceRadixSort::SortPairsDescending(
+        d_temp_storage, temp_storage_bytes,
+        d_keys, d_values,
+        num_items, begin_bit, end_bit, stream, debug_synchronous);
+}
+
+
+/**
+ * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for descending): calls the pointer-based DeviceRadixSort::SortPairsDescending, then toggles both buffer selectors
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<true>              is_descending,
+    Int2Type<CUB_NO_OVERWRITE>  dispatch_to,
+    int                         *d_selector,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    // Pass the current buffers through const pointers, each followed by its alternate buffer
+    const KeyT      *keys_in    = d_keys.Current();
+    const ValueT    *values_in  = d_values.Current();
+    cudaError_t error = DeviceRadixSort::SortPairsDescending(
+        d_temp_storage, temp_storage_bytes,
+        keys_in, d_keys.Alternate(), values_in, d_values.Alternate(),
+        num_items, begin_bit, end_bit, stream, debug_synchronous);
+    // Toggle both selectors regardless of the returned error code
+    d_keys.selector     = d_keys.selector ^ 1;
+    d_values.selector   = d_values.selector ^ 1;
+    return error;
+}
+
+//---------------------------------------------------------------------
+// Dispatch to different DeviceSegmentedRadixSort entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for ascending): forwards the key/value DoubleBuffers and segment offsets to DeviceSegmentedRadixSort::SortPairs and returns its error code
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<false>         is_descending,          // Overload tag: ascending
+    Int2Type<CUB_SEGMENTED> dispatch_to,            // Overload tag: CUB segmented backend
+    int                     *d_selector,            // Unused by this overload
+    size_t                  *d_temp_storage_bytes,  // Unused by this overload
+    cudaError_t             *d_cdp_error,           // Unused by this overload
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,     // Passed twice below: as-is and advanced by one element
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    return DeviceSegmentedRadixSort::SortPairs(
+        d_temp_storage, temp_storage_bytes,
+        d_keys, d_values,
+        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
+        begin_bit, end_bit, stream, debug_synchronous);
+}
+
+/**
+ * Ascending key-value sort for the CUB_SEGMENTED_NO_OVERWRITE backend.
+ *
+ * Calls the pointer-based DeviceSegmentedRadixSort::SortPairs overload with
+ * const input pointers taken from the current buffers and output pointers
+ * taken from the alternate buffers. Both selectors are flipped after the
+ * call, whatever status it returned.
+ *
+ * d_selector, d_temp_storage_bytes and d_cdp_error are not read here.
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<false>                         is_descending,
+    Int2Type<CUB_SEGMENTED_NO_OVERWRITE>    dispatch_to,
+    int                                     *d_selector,
+    size_t                                  *d_temp_storage_bytes,
+    cudaError_t                             *d_cdp_error,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    // Inputs are read-only views of the current buffers
+    const KeyT      *d_keys_in      = d_keys.Current();
+    const ValueT    *d_values_in    = d_values.Current();
+
+    // Outputs go to the alternate buffers
+    KeyT            *d_keys_out     = d_keys.Alternate();
+    ValueT          *d_values_out   = d_values.Alternate();
+
+    cudaError_t error = DeviceSegmentedRadixSort::SortPairs(
+        d_temp_storage, temp_storage_bytes,
+        d_keys_in, d_keys_out, d_values_in, d_values_out,
+        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
+        begin_bit, end_bit, stream, debug_synchronous);
+
+    // Flip both selectors (the alternate buffers were the output targets)
+    d_values.selector ^= 1;
+    d_keys.selector ^= 1;
+
+    return error;
+}
+
+
+/**
+ * Descending key-value sort for the CUB_SEGMENTED backend.
+ *
+ * Forwards the DoubleBuffer pair straight to
+ * DeviceSegmentedRadixSort::SortPairsDescending. The same offsets array is
+ * passed twice: as-is for the segment begin offsets and shifted by one
+ * element for the segment end offsets.
+ *
+ * d_selector, d_temp_storage_bytes and d_cdp_error are not read here.
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<true>          is_descending,
+    Int2Type<CUB_SEGMENTED> dispatch_to,
+    int                     *d_selector,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    // Begin/end offsets are two views of one array, one element apart
+    const int *d_segment_begins = d_segment_offsets;
+    const int *d_segment_ends   = d_segment_offsets + 1;
+
+    cudaError_t error = DeviceSegmentedRadixSort::SortPairsDescending(
+        d_temp_storage, temp_storage_bytes,
+        d_keys, d_values,
+        num_items, num_segments, d_segment_begins, d_segment_ends,
+        begin_bit, end_bit, stream, debug_synchronous);
+
+    return error;
+}
+
+/**
+ * Descending key-value sort for the CUB_SEGMENTED_NO_OVERWRITE backend.
+ *
+ * Calls the pointer-based DeviceSegmentedRadixSort::SortPairsDescending
+ * overload with const input pointers taken from the current buffers and
+ * output pointers taken from the alternate buffers. Both selectors are
+ * flipped after the call, whatever status it returned.
+ *
+ * d_selector, d_temp_storage_bytes and d_cdp_error are not read here.
+ */
+template <typename KeyT, typename ValueT>
+CUB_RUNTIME_FUNCTION
+__forceinline__
+cudaError_t Dispatch(
+    Int2Type<true>                          is_descending,
+    Int2Type<CUB_SEGMENTED_NO_OVERWRITE>    dispatch_to,
+    int                                     *d_selector,
+    size_t                                  *d_temp_storage_bytes,
+    cudaError_t                             *d_cdp_error,
+
+    void*                   d_temp_storage,
+    size_t&                 temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    // Inputs are read-only views of the current buffers
+    const KeyT      *d_keys_in      = d_keys.Current();
+    const ValueT    *d_values_in    = d_values.Current();
+
+    // Outputs go to the alternate buffers
+    KeyT            *d_keys_out     = d_keys.Alternate();
+    ValueT          *d_values_out   = d_values.Alternate();
+
+    cudaError_t error = DeviceSegmentedRadixSort::SortPairsDescending(
+        d_temp_storage, temp_storage_bytes,
+        d_keys_in, d_keys_out, d_values_in, d_values_out,
+        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
+        begin_bit, end_bit, stream, debug_synchronous);
+
+    // Flip both selectors (the alternate buffers were the output targets)
+    d_values.selector ^= 1;
+    d_keys.selector ^= 1;
+
+    return error;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Keys-only sort through Thrust (THRUST backend).
+ *
+ * When d_temp_storage is null the call only reports a nominal one-byte
+ * temporary-storage requirement. Otherwise the current key buffer is sorted
+ * in place with thrust::sort; for descending order the range is reversed
+ * before and after the ascending sort.
+ *
+ * The segment, bit-range, stream and CDP bookkeeping parameters are not
+ * read here. Always returns cudaSuccess.
+ */
+template <int IS_DESCENDING, typename KeyT>
+cudaError_t Dispatch(
+    Int2Type<IS_DESCENDING> is_descending,
+    Int2Type<THRUST>        dispatch_to,
+    int                     *d_selector,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void                    *d_temp_storage,
+    size_t                  &temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<NullType>  &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    // Size query only: nothing is sorted
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+        return cudaSuccess;
+    }
+
+    thrust::device_ptr<KeyT> keys_begin(d_keys.Current());
+    thrust::device_ptr<KeyT> keys_end = keys_begin + num_items;
+
+    if (IS_DESCENDING)
+        thrust::reverse(keys_begin, keys_end);
+
+    thrust::sort(keys_begin, keys_end);
+
+    if (IS_DESCENDING)
+        thrust::reverse(keys_begin, keys_end);
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Key-value sort through Thrust (THRUST backend).
+ *
+ * When d_temp_storage is null the call only reports a nominal one-byte
+ * temporary-storage requirement. Otherwise the current key and value buffers
+ * are sorted in place. Descending order is obtained by reversing both ranges,
+ * sorting ascending, and reversing again.
+ *
+ * A stable sort is used on purpose: the reverse/sort/reverse idiom only
+ * yields a well-defined order for equal keys when the sort is stable, and
+ * the host reference in this file is built with std::stable_sort.
+ * thrust::sort_by_key makes no stability guarantee.
+ *
+ * The segment, bit-range, stream and CDP bookkeeping parameters are not
+ * read here. Always returns cudaSuccess.
+ */
+template <int IS_DESCENDING, typename KeyT, typename ValueT>
+cudaError_t Dispatch(
+    Int2Type<IS_DESCENDING> is_descending,
+    Int2Type<THRUST>        dispatch_to,
+    int                     *d_selector,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void                    *d_temp_storage,
+    size_t                  &temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+
+    if (d_temp_storage == 0)
+    {
+        // Size query only: nothing is sorted
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<KeyT>     d_keys_wrapper(d_keys.Current());
+        thrust::device_ptr<ValueT>   d_values_wrapper(d_values.Current());
+
+        if (IS_DESCENDING) {
+            thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
+            thrust::reverse(d_values_wrapper, d_values_wrapper + num_items);
+        }
+
+        // Stable variant: equal keys must keep their relative order
+        thrust::stable_sort_by_key(d_keys_wrapper, d_keys_wrapper + num_items, d_values_wrapper);
+
+        if (IS_DESCENDING) {
+            thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
+            thrust::reverse(d_values_wrapper, d_values_wrapper + num_items);
+        }
+    }
+
+    return cudaSuccess;
+}
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Wrapper kernel that runs the CUB-backend Dispatch from device code.
+ *
+ * With CUB_CDP defined, the kernel stores the Dispatch status in
+ * *d_cdp_error, the (possibly updated) temporary-storage size in
+ * *d_temp_storage_bytes and the key selector in *d_selector.
+ * Without CUB_CDP it only writes cudaErrorNotSupported to *d_cdp_error.
+ *
+ * No thread index is consulted, so every launched thread executes the body.
+ */
+template <int IS_DESCENDING, typename KeyT, typename ValueT>
+__global__ void CnpDispatchKernel(
+    Int2Type<IS_DESCENDING> is_descending,
+    int                     *d_selector,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void                    *d_temp_storage,
+    size_t                  temp_storage_bytes,
+    DoubleBuffer<KeyT>      d_keys,
+    DoubleBuffer<ValueT>    d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    bool                    debug_synchronous)
+{
+#ifdef CUB_CDP
+    // Device-side sort on stream 0
+    cudaError_t error = Dispatch(
+        is_descending, Int2Type<CUB>(), d_selector, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys, d_values,
+        num_items, num_segments, d_segment_offsets,
+        begin_bit, end_bit, 0, debug_synchronous);
+
+    // Publish the results for the host to copy back
+    *d_cdp_error            = error;
+    *d_temp_storage_bytes   = temp_storage_bytes;
+    *d_selector             = d_keys.selector;
+#else
+    *d_cdp_error            = cudaErrorNotSupported;
+#endif
+}
+
+
+/**
+ * Dispatch to the CDP wrapper kernel (CDP backend).
+ *
+ * Launches CnpDispatchKernel with a single thread, then copies back to the
+ * host:
+ *   - the selector (applied to both d_keys and d_values),
+ *   - the temporary-storage size,
+ *   - the status reported by the device-side dispatch, which is returned.
+ *
+ * The launch uses the default <<<1,1>>> configuration; the stream argument
+ * is not used by this overload. A failed launch or copy exits through
+ * CubDebugExit.
+ */
+template <int IS_DESCENDING, typename KeyT, typename ValueT>
+cudaError_t Dispatch(
+    Int2Type<IS_DESCENDING> is_descending,
+    Int2Type<CDP>           dispatch_to,
+    int                     *d_selector,
+    size_t                  *d_temp_storage_bytes,
+    cudaError_t             *d_cdp_error,
+
+    void                    *d_temp_storage,
+    size_t                  &temp_storage_bytes,
+    DoubleBuffer<KeyT>      &d_keys,
+    DoubleBuffer<ValueT>    &d_values,
+    int                     num_items,
+    int                     num_segments,
+    const int               *d_segment_offsets,
+    int                     begin_bit,
+    int                     end_bit,
+    cudaStream_t            stream,
+    bool                    debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<<<1,1>>>(
+        is_descending, d_selector, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys, d_values,
+        num_items, num_segments, d_segment_offsets,
+        begin_bit, end_bit, debug_synchronous);
+
+    // A kernel launch returns no status itself: check for launch failures
+    // before reading back outputs the kernel may never have written
+    CubDebugExit(cudaPeekAtLastError());
+
+    // Copy out selector
+    CubDebugExit(cudaMemcpy(&d_keys.selector, d_selector, sizeof(int) * 1, cudaMemcpyDeviceToHost));
+    d_values.selector = d_keys.selector;
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Problem generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Key-value record ordered by key alone (primary template).
+ *
+ * IS_FLOAT defaults to whether Traits<KeyT> reports a floating-point
+ * category; it exists so specializations can be selected on it.
+ */
+template <
+    typename KeyT,
+    typename ValueT,
+    bool IS_FLOAT = (Traits<KeyT>::CATEGORY == FLOATING_POINT)>
+struct Pair
+{
+    KeyT     key;
+    ValueT   value;
+
+    /// Strict ordering on the key; the value does not participate
+    bool operator<(const Pair &rhs) const
+    {
+        const bool key_precedes = (key < rhs.key);
+        return key_precedes;
+    }
+};
+
+
+/**
+ * Key-value record specialized for bool keys.
+ *
+ * Ordering is spelled out with logical operators: false sorts before true.
+ */
+template <typename ValueT>
+struct Pair<bool, ValueT, false>
+{
+    bool     key;
+    ValueT   value;
+
+    /// True only when this key is false and the other key is true
+    bool operator<(const Pair &rhs) const
+    {
+        if (key)
+            return false;
+
+        return rhs.key;
+    }
+};
+
+
+/**
+ * Key-value record specialized for floating-point keys.
+ *
+ * Keys are ordered by value first. When neither key compares less than the
+ * other, the sign bits decide: a key with its sign bit set sorts before one
+ * with its sign bit clear (so negative zero precedes positive zero).
+ */
+template <typename KeyT, typename ValueT>
+struct Pair<KeyT, ValueT, true>
+{
+    KeyT     key;
+    ValueT   value;
+
+    bool operator<(const Pair &b) const
+    {
+        if (key < b.key)
+            return true;
+
+        if (key > b.key)
+            return false;
+
+        // KeyT in unsigned bits
+        typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+        // Read the raw bit patterns with memcpy rather than casting the
+        // pointer: dereferencing a KeyT object through an UnsignedBits*
+        // violates strict aliasing
+        UnsignedBits key_bits;
+        UnsignedBits b_key_bits;
+        memcpy(&key_bits,   &key,   sizeof(UnsignedBits));
+        memcpy(&b_key_bits, &b.key, sizeof(UnsignedBits));
+
+        const UnsignedBits HIGH_BIT = Traits<KeyT>::HIGH_BIT;
+
+        // Return true if key is negative zero and b.key is positive zero
+        return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
+    }
+};
+
+
+/**
+ * Fill h_keys[0 .. num_items) by calling InitValue on each element with the
+ * requested generation mode and the element's index.
+ *
+ * NOTE(review): entropy_reduction is accepted but not read in this body.
+ */
+template <typename KeyT>
+void InitializeKeyBits(
+    GenMode         gen_mode,
+    KeyT            *h_keys,
+    int             num_items,
+    int             entropy_reduction)
+{
+    int item = 0;
+    while (item < num_items)
+    {
+        InitValue(gen_mode, h_keys[item], item);
+        ++item;
+    }
+}
+
+
+/**
+ * Build the host reference solution.
+ *
+ * Each key is paired with its original index. When the bit range is narrower
+ * than the key, the pair's key keeps only bits [begin_bit, end_bit) of the
+ * original. Every segment is then stable-sorted on the host (wrapped in a
+ * reverse before and after for descending order).
+ *
+ * On return h_reference_ranks[i] holds the original index of the item that
+ * lands at position i, and h_reference_keys[i] the corresponding unmasked
+ * key. Both arrays are allocated here with new[]; the caller releases them.
+ */
+template <bool IS_DESCENDING, typename KeyT>
+void InitializeSolution(
+    KeyT    *h_keys,
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets,
+    int     begin_bit,
+    int     end_bit,
+    int     *&h_reference_ranks,
+    KeyT    *&h_reference_keys)
+{
+    typedef Pair<KeyT, int> PairT;
+
+    PairT *h_pairs = new PairT[num_items];
+
+    const int   num_bits        = end_bit - begin_bit;
+    const bool  is_partial_key  = (num_bits < sizeof(KeyT) * 8);
+
+    for (int item = 0; item < num_items; ++item)
+    {
+        PairT &pair = h_pairs[item];
+        pair.value = item;
+
+        if (!is_partial_key)
+        {
+            pair.key = h_keys[item];
+            continue;
+        }
+
+        // Keep only the bits that take part in the sort
+        unsigned long long key_bits = 0;
+        memcpy(&key_bits, &h_keys[item], sizeof(KeyT));
+        key_bits &= ((1ull << num_bits) - 1) << begin_bit;
+        memcpy(&pair.key, &key_bits, sizeof(KeyT));
+    }
+
+    printf("\nSorting reference solution on CPU (%d segments)...", num_segments); fflush(stdout);
+
+    for (int segment = 0; segment < num_segments; ++segment)
+    {
+        PairT *segment_begin    = h_pairs + h_segment_offsets[segment];
+        PairT *segment_end      = h_pairs + h_segment_offsets[segment + 1];
+
+        if (IS_DESCENDING)
+            std::reverse(segment_begin, segment_end);
+
+        std::stable_sort(segment_begin, segment_end);
+
+        if (IS_DESCENDING)
+            std::reverse(segment_begin, segment_end);
+    }
+
+    printf(" Done.\n"); fflush(stdout);
+
+    h_reference_ranks  = new int[num_items];
+    h_reference_keys   = new KeyT[num_items];
+
+    for (int item = 0; item < num_items; ++item)
+    {
+        const int source_index      = h_pairs[item].value;
+        h_reference_ranks[item]     = source_index;
+        h_reference_keys[item]      = h_keys[source_index];
+    }
+
+    delete[] h_pairs;
+}
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Test DeviceRadixSort
+ *
+ * Runs one backend/direction/type combination end to end:
+ *   1. allocates device buffers and queries the temporary-storage size,
+ *   2. uploads the host keys (and values unless ValueT is NullType),
+ *   3. performs one warmup/correctness run and compares the device output
+ *      against h_reference_keys / h_reference_values,
+ *   4. optionally performs g_timing_iterations timed runs,
+ *   5. frees all device memory and asserts that every comparison passed.
+ *
+ * Any failing CUDA call exits through CubDebugExit.
+ */
+template <
+    Backend     BACKEND,
+    bool        IS_DESCENDING,
+    typename    KeyT,
+    typename    ValueT>
+void Test(
+    KeyT        *h_keys,
+    ValueT      *h_values,
+    int         num_items,
+    int         num_segments,
+    int         *h_segment_offsets,
+    int         begin_bit,
+    int         end_bit,
+    KeyT        *h_reference_keys,
+    ValueT      *h_reference_values)
+{
+    const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
+
+    // Banner.  Note the backend label: any backend other than CUB_NO_OVERWRITE,
+    // CDP or THRUST (including the segmented ones) is printed as "CUB".
+    printf("%s %s cub::DeviceRadixSort %d items, %d segments, %d-byte keys (%s) %d-byte values (%s), descending %d, begin_bit %d, end_bit %d\n",
+        (BACKEND == CUB_NO_OVERWRITE) ? "CUB_NO_OVERWRITE" : (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        (KEYS_ONLY) ? "keys-only" : "key-value",
+        num_items, num_segments,
+        (int) sizeof(KeyT), typeid(KeyT).name(), (KEYS_ONLY) ? 0 : (int) sizeof(ValueT), typeid(ValueT).name(),
+        IS_DESCENDING, begin_bit, end_bit);
+    fflush(stdout);
+
+    if (g_verbose)
+    {
+        printf("Input keys:\n");
+        DisplayResults(h_keys, num_items);
+        printf("\n\n");
+    }
+
+    // Allocate device arrays
+    DoubleBuffer<KeyT>   d_keys;
+    DoubleBuffer<ValueT> d_values;
+    int                 *d_selector;
+    int                 *d_segment_offsets;
+    size_t              *d_temp_storage_bytes;
+    cudaError_t         *d_cdp_error;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_selector, sizeof(int) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(int) * (num_segments + 1)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
+    // Value buffers are only allocated for key-value tests
+    if (!KEYS_ONLY)
+    {
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(ValueT) * num_items));
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(ValueT) * num_items));
+    }
+
+    // Allocate temporary storage (and make it un-aligned)
+    // First pass: d_temp_storage is NULL, so this call is used to obtain the
+    // required size in temp_storage_bytes
+    size_t  temp_storage_bytes  = 0;
+    void    *d_temp_storage     = NULL;
+    CubDebugExit(Dispatch(
+        Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys, d_values,
+        num_items, num_segments, d_segment_offsets,
+        begin_bit, end_bit, 0, true));
+
+    // One extra byte is allocated so the pointer handed to the sort can be
+    // offset by one byte from the allocation start
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1));
+    void* mis_aligned_temp = static_cast<char*>(d_temp_storage) + 1;
+
+    // Initialize/clear device arrays
+    // (selectors are reset because the size query above may have flipped them)
+    d_keys.selector = 0;
+    CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items));
+    if (!KEYS_ONLY)
+    {
+        d_values.selector = 0;
+        CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
+        CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items));
+    }
+    CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(int) * (num_segments + 1), cudaMemcpyHostToDevice));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(
+        Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
+        mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
+        num_items, num_segments, d_segment_offsets,
+        begin_bit, end_bit, 0, true));
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Check for correctness (and display results, if specified)
+    // A nonzero compare result is reported as FAIL; results are OR-ed into
+    // `compare` and asserted at the very end, after cleanup
+    printf("Warmup done.  Checking results:\n"); fflush(stdout);
+    int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose);
+    printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout);
+    if (!KEYS_ONLY)
+    {
+        int values_compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
+        compare |= values_compare;
+        printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+    // NOTE(review): only CUB_NO_OVERWRITE gets the input-preservation check;
+    // CUB_SEGMENTED_NO_OVERWRITE is not covered by it
+    if (BACKEND == CUB_NO_OVERWRITE)
+    {
+        // Check that input isn't overwritten
+        int input_compare = CompareDeviceResults(h_keys, d_keys.d_buffers[0], num_items, true, g_verbose);
+        compare |= input_compare;
+        printf("\t Compare input keys: %s ", input_compare ? "FAIL" : "PASS"); fflush(stdout);
+    }
+
+    // Performance
+    // NOTE: only the printf is governed by the if; the fflush always runs
+    if (g_timing_iterations)
+        printf("\nPerforming timing iterations:\n"); fflush(stdout);
+
+    GpuTimer gpu_timer;
+    float elapsed_millis = 0.0f;
+    for (int i = 0; i < g_timing_iterations; ++i)
+    {
+        // Initialize/clear device arrays
+        // (input goes into whichever buffer is currently selected; the other is zeroed)
+        CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+        CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
+        if (!KEYS_ONLY)
+        {
+            CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
+            CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
+        }
+
+        // Only the dispatch itself is timed, not the re-initialization above
+        gpu_timer.Start();
+        CubDebugExit(Dispatch(
+            Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
+            mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
+            num_items, num_segments, d_segment_offsets,
+            begin_bit, end_bit, 0, false));
+        gpu_timer.Stop();
+        elapsed_millis += gpu_timer.ElapsedMillis();
+    }
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        // "Logical" bandwidth counts each item's bytes twice
+        float giga_bandwidth = (KEYS_ONLY) ?
+            giga_rate * sizeof(KeyT) * 2 :
+            giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
+        printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
+    }
+
+    printf("\n\n");
+
+    // Cleanup
+    if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
+    if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
+    if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
+    if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector));
+    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+
+    // Correctness asserts
+    AssertEquals(0, compare);
+}
+
+
+/**
+ * Run every applicable backend for one key/value type combination.
+ *
+ * For key-value tests the input values are generated from each item's index
+ * and the expected values from the corresponding reference rank. The
+ * single-segment backends (CUB, CUB_NO_OVERWRITE, and CDP when CUB_CDP is
+ * defined) run only when num_segments is 1; the segmented backends always
+ * run.
+ */
+template <bool IS_DESCENDING, typename KeyT, typename ValueT>
+void TestBackend(
+    KeyT    *h_keys,
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets,
+    int     begin_bit,
+    int     end_bit,
+    KeyT    *h_reference_keys,
+    int     *h_reference_ranks)
+{
+    const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
+
+    // Stay null for keys-only tests
+    ValueT *h_values             = NULL;
+    ValueT *h_reference_values   = NULL;
+
+    if (!KEYS_ONLY)
+    {
+        h_values            = new ValueT[num_items];
+        h_reference_values  = new ValueT[num_items];
+
+        for (int item = 0; item < num_items; ++item)
+        {
+            const int rank = h_reference_ranks[item];
+            InitValue(INTEGER_SEED, h_values[item], item);
+            InitValue(INTEGER_SEED, h_reference_values[item], rank);
+        }
+    }
+
+    const bool single_segment = (num_segments == 1);
+    if (single_segment)
+    {
+        // Non-segmented entrypoints
+        Test<CUB, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
+        Test<CUB_NO_OVERWRITE, IS_DESCENDING>(  h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
+#ifdef CUB_CDP
+        Test<CDP, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
+#endif
+    }
+
+    // Segmented entrypoints
+    Test<CUB_SEGMENTED, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
+    Test<CUB_SEGMENTED_NO_OVERWRITE, IS_DESCENDING>(  h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
+
+    // delete[] on a null pointer is a no-op
+    delete[] h_values;
+    delete[] h_reference_values;
+}
+
+
+
+
+/**
+ * Build the reference solution once for the given keys and bit range, then
+ * run TestBackend for keys-only and for each tested value type.
+ */
+template <bool IS_DESCENDING, typename KeyT>
+void TestValueTypes(
+    KeyT    *h_keys,
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets,
+    int     begin_bit,
+    int     end_bit)
+{
+    // Reference ranks/keys, allocated by InitializeSolution
+    KeyT    *h_reference_keys   = NULL;
+    int     *h_reference_ranks  = NULL;
+    InitializeSolution<IS_DESCENDING>(
+        h_keys, num_items, num_segments, h_segment_offsets,
+        begin_bit, end_bit, h_reference_ranks, h_reference_keys);
+
+    // Keys only
+    TestBackend<IS_DESCENDING, KeyT, NullType>(
+        h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+
+    // 8b values
+    TestBackend<IS_DESCENDING, KeyT, unsigned char>(
+        h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+
+    // 32b values
+    TestBackend<IS_DESCENDING, KeyT, unsigned int>(
+        h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+
+    // 64b values
+    TestBackend<IS_DESCENDING, KeyT, unsigned long long>(
+        h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+
+    // Non-trivially-constructible values
+    TestBackend<IS_DESCENDING, KeyT, TestBar>(
+        h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
+
+    // Release the reference solution (delete[] on null is a no-op)
+    delete[] h_reference_ranks;
+    delete[] h_reference_keys;
+}
+
+
+
+/**
+ * Run the value-type tests in both sort directions: descending first, then
+ * ascending.
+ */
+template <typename KeyT>
+void TestDirection(
+    KeyT    *h_keys,
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets,
+    int     begin_bit,
+    int     end_bit)
+{
+    // Descending
+    TestValueTypes<true, KeyT>(
+        h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
+
+    // Ascending
+    TestValueTypes<false, KeyT>(
+        h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
+}
+
+
+/**
+ * Exercise different key bit ranges.
+ *
+ * Unsigned integer keys other than bool are additionally tested on
+ * [1, bits - 1) and on a two-bit window straddling the middle of the key.
+ * Every key type is tested on its full width.
+ */
+template <typename KeyT>
+void TestBits(
+    KeyT    *h_keys,
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets)
+{
+    // Partial-word sorting is skipped for boolean, fp, or signed types (the bit-flipping techniques get in the way)
+    const bool is_unsigned_integer  = (Traits<KeyT>::CATEGORY == UNSIGNED_INTEGER);
+    const bool is_bool              = Equals<KeyT, bool>::VALUE;
+
+    if (is_unsigned_integer && !is_bool)
+    {
+        // All but the lowest and highest bit
+        int partial_begin   = 1;
+        int partial_end     = (sizeof(KeyT) * 8) - 1;
+        printf("Testing key bits [%d,%d)\n", partial_begin, partial_end); fflush(stdout);
+        TestDirection(h_keys, num_items, num_segments, h_segment_offsets, partial_begin, partial_end);
+
+        // Two bits around the middle of the key
+        int mid_bit     = sizeof(KeyT) * 4;
+        int window_lo   = mid_bit - 1;
+        int window_hi   = mid_bit + 1;
+        printf("Testing key bits [%d,%d)\n", window_lo, window_hi); fflush(stdout);
+        TestDirection(h_keys, num_items, num_segments, h_segment_offsets, window_lo, window_hi);
+    }
+
+    // Full key width
+    const int full_bits = int(sizeof(KeyT)) * 8;
+    printf("Testing key bits [%d,%d)\n", 0, full_bits); fflush(stdout);
+    TestDirection(h_keys, num_items, num_segments, h_segment_offsets, 0, full_bits);
+}
+
+
+/**
+ * Exercise different segment counts for a fixed number of items.
+ *
+ * Starting at max_segments, the segment count is repeatedly divided by 32
+ * (rounding up) while it stays above 1; the single-segment case is handled
+ * separately at the end. Configurations averaging 128,000 or more items per
+ * segment are skipped.
+ */
+template <typename KeyT>
+void TestSegments(
+    KeyT    *h_keys,
+    int     num_items,
+    int     max_segments)
+{
+    // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
+    const int segment_item_limit = 128 * 1000;
+
+    int *h_segment_offsets = new int[max_segments + 1];
+
+    int num_segments = max_segments;
+    while (num_segments > 1)
+    {
+        if (num_items / num_segments < segment_item_limit)
+        {
+            InitializeSegments(num_items, num_segments, h_segment_offsets);
+            TestBits(h_keys, num_items, num_segments, h_segment_offsets);
+        }
+
+        num_segments = (num_segments + 32 - 1) / 32;
+    }
+
+    // Single segment
+    if (num_items < segment_item_limit)
+    {
+        InitializeSegments(num_items, 1, h_segment_offsets);
+        TestBits(h_keys, num_items, 1, h_segment_offsets);
+    }
+
+    delete[] h_segment_offsets;
+}
+
+
+/**
+ * Exercise different problem sizes.
+ *
+ * Starting at max_items, the item count is repeatedly divided by 32
+ * (rounding up) while it stays above 1; sizes 1 and 0 are then tested
+ * explicitly.
+ */
+template <typename KeyT>
+void TestSizes(
+    KeyT    *h_keys,
+    int     max_items,
+    int     max_segments)
+{
+    int num_items = max_items;
+    while (num_items > 1)
+    {
+        TestSegments(h_keys, num_items, max_segments);
+        num_items = (num_items + 32 - 1) / 32;
+    }
+
+    // Degenerate sizes
+    TestSegments(h_keys, 1, max_segments);
+    TestSegments(h_keys, 0, max_segments);
+}
+
+
+/**
+ * Test key sampling distributions
+ *
+ * Generates keys in RANDOM mode (once per entropy-reduction factor 0, 3, 6),
+ * UNIFORM mode and INTEGER_SEED mode, running the full size sweep on each.
+ *
+ * A negative max_items selects a default that depends on the PTX version
+ * (9000003 when it is above 100, otherwise 5000003); a negative
+ * max_segments selects 5003.
+ */
+template <typename KeyT>
+void TestGen(
+    int             max_items,
+    int             max_segments)
+{
+    int ptx_version;
+    CubDebugExit(PtxVersion(ptx_version));
+
+    // Plain conditional expression: the original nested an assignment to
+    // max_items inside the else-arm, which produced the same value but was
+    // easy to misread
+    if (max_items < 0)
+        max_items = (ptx_version > 100) ? 9000003 : 5000003;
+
+    if (max_segments < 0)
+        max_segments = 5003;
+
+    KeyT *h_keys = new KeyT[max_items];
+
+    for (int entropy_reduction = 0; entropy_reduction <= 6; entropy_reduction += 3)
+    {
+        printf("\nTesting random %s keys with entropy reduction factor %d\n", typeid(KeyT).name(), entropy_reduction); fflush(stdout);
+        InitializeKeyBits(RANDOM, h_keys, max_items, entropy_reduction);
+        TestSizes(h_keys, max_items, max_segments);
+    }
+
+    printf("\nTesting uniform %s keys\n", typeid(KeyT).name()); fflush(stdout);
+    InitializeKeyBits(UNIFORM, h_keys, max_items, 0);
+    TestSizes(h_keys, max_items, max_segments);
+
+    printf("\nTesting natural number %s keys\n", typeid(KeyT).name()); fflush(stdout);
+    InitializeKeyBits(INTEGER_SEED, h_keys, max_items, 0);
+    TestSizes(h_keys, max_items, max_segments);
+
+    if (h_keys) delete[] h_keys;
+}
+
+
+//---------------------------------------------------------------------
+// Simple test
+//---------------------------------------------------------------------
+
+/**
+ * Self-contained single test: generates keys and segments, builds the host
+ * reference, then runs the device test for one backend.
+ *
+ * A negative end_bit selects the full key width.
+ */
+template <
+    Backend     BACKEND,
+    typename    KeyT,
+    typename    ValueT,
+    bool        IS_DESCENDING>
+void Test(
+    int         num_items,
+    int         num_segments,
+    GenMode     gen_mode,
+    int         entropy_reduction,
+    int         begin_bit,
+    int         end_bit)
+{
+    const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
+
+    // Host inputs
+    KeyT    *h_keys             = new KeyT[num_items];
+    int     *h_reference_ranks  = NULL;
+    KeyT    *h_reference_keys   = NULL;
+    ValueT  *h_values           = NULL;
+    ValueT  *h_reference_values = NULL;
+    int     *h_segment_offsets  = new int[num_segments + 1];
+
+    // Default to sorting on every key bit
+    if (end_bit < 0)
+    {
+        end_bit = sizeof(KeyT) * 8;
+    }
+
+    InitializeKeyBits(gen_mode, h_keys, num_items, entropy_reduction);
+    InitializeSegments(num_items, num_segments, h_segment_offsets);
+    InitializeSolution<IS_DESCENDING>(
+        h_keys, num_items, num_segments, h_segment_offsets,
+        begin_bit, end_bit, h_reference_ranks, h_reference_keys);
+
+    if (!KEYS_ONLY)
+    {
+        h_values            = new ValueT[num_items];
+        h_reference_values  = new ValueT[num_items];
+
+        for (int item = 0; item < num_items; ++item)
+        {
+            const int rank = h_reference_ranks[item];
+            InitValue(INTEGER_SEED, h_values[item], item);
+            InitValue(INTEGER_SEED, h_reference_values[item], rank);
+        }
+    }
+
+    // Ranks are only needed to derive the reference values
+    delete[] h_reference_ranks;
+
+    printf("\nTesting bits [%d,%d) of %s keys with gen-mode %d\n", begin_bit, end_bit, typeid(KeyT).name(), gen_mode); fflush(stdout);
+    Test<BACKEND, IS_DESCENDING>(
+        h_keys, h_values,
+        num_items, num_segments, h_segment_offsets,
+        begin_bit, end_bit, h_reference_keys, h_reference_values);
+
+    // Host cleanup (delete[] on null is a no-op)
+    delete[] h_keys;
+    delete[] h_reference_keys;
+    delete[] h_values;
+    delete[] h_reference_values;
+    delete[] h_segment_offsets;
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parses command-line options, initializes the device, then runs the test set selected by QUICKER_TEST / QUICK_TEST
+ */
+int main(int argc, char** argv)
+{
+    int bits = -1;                  // forwarded as end_bit to the quick tests (-1 when --bits is not given)
+    int num_items = -1;             // negative until overridden by --n or a per-mode default
+    int num_segments = -1;          // negative until overridden by --s or a per-mode default
+    int entropy_reduction = 0;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("s", num_segments);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("bits", bits);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage (every option is bracketed and separated by a single space)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--bits=<valid key bits>] "
+            "[--n=<input items>] "
+            "[--s=<num segments>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v] "
+            "[--entropy=<entropy-reduction factor (default 0)>]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get ptx version (only consulted by the thorough test set below)
+    int ptx_version;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef QUICKER_TEST
+
+    enum {
+        IS_DESCENDING   = false
+    };
+
+    // Compile/run basic CUB test
+    if (num_items < 0)      num_items       = 48000000;
+    if (num_segments < 0)   num_segments    = 5000;
+
+
+    Test<CUB_SEGMENTED, unsigned int,       NullType, IS_DESCENDING>(       num_items, num_segments,    RANDOM, entropy_reduction, 0, bits);
+
+    Test<CUB,           unsigned int,       NullType, IS_DESCENDING>(       num_items, 1,               RANDOM, entropy_reduction, 0, bits);
+    Test<CUB,           unsigned long long, NullType, IS_DESCENDING>(       num_items, 1,               RANDOM, entropy_reduction, 0, bits);
+
+    Test<CUB,           unsigned int,       unsigned int, IS_DESCENDING>(   num_items, 1,               RANDOM, entropy_reduction, 0, bits);
+    Test<CUB,           unsigned long long, unsigned int, IS_DESCENDING>(   num_items, 1,               RANDOM, entropy_reduction, 0, bits);
+
+#elif defined(QUICK_TEST)
+
+    // Compile/run quick tests
+    if (num_items < 0)      num_items       = 48000000;
+    if (num_segments < 0)   num_segments    = 5000;
+
+    // Compare CUB and thrust on 32b keys-only
+    Test<CUB, unsigned int, NullType, false> (                      num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<THRUST, unsigned int, NullType, false> (                   num_items, 1, RANDOM, entropy_reduction, 0, bits);
+
+    // Compare CUB and thrust on 64b keys-only
+    Test<CUB, unsigned long long, NullType, false> (                num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<THRUST, unsigned long long, NullType, false> (             num_items, 1, RANDOM, entropy_reduction, 0, bits);
+
+
+    // Compare CUB and thrust on 32b key-value pairs
+    Test<CUB, unsigned int, unsigned int, false> (                  num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<THRUST, unsigned int, unsigned int, false> (               num_items, 1, RANDOM, entropy_reduction, 0, bits);
+
+    // Compare CUB and thrust on 64b key-value pairs
+    Test<CUB, unsigned long long, unsigned long long, false> (      num_items, 1, RANDOM, entropy_reduction, 0, bits);
+    Test<THRUST, unsigned long long, unsigned long long, false> (   num_items, 1, RANDOM, entropy_reduction, 0, bits);
+
+
+#else
+
+    // Compile/run thorough tests (the loop body runs g_repeat + 1 times)
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        TestGen<bool>                 (num_items, num_segments);
+
+        TestGen<char>                 (num_items, num_segments);
+        TestGen<signed char>          (num_items, num_segments);
+        TestGen<unsigned char>        (num_items, num_segments);
+
+        TestGen<short>                (num_items, num_segments);
+        TestGen<unsigned short>       (num_items, num_segments);
+
+        TestGen<int>                  (num_items, num_segments);
+        TestGen<unsigned int>         (num_items, num_segments);
+
+        TestGen<long>                 (num_items, num_segments);
+        TestGen<unsigned long>        (num_items, num_segments);
+
+        TestGen<long long>            (num_items, num_segments);
+        TestGen<unsigned long long>   (num_items, num_segments);
+
+        TestGen<float>                (num_items, num_segments);
+
+        if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+            TestGen<double>           (num_items, num_segments);
+
+    }
+
+#endif
+
+    return 0;
+}
+
diff --git a/external/cub/test/test_device_reduce.cu b/external/cub/test/test_device_reduce.cu
new file mode 100644
index 00000000000..26c663dab4c
--- /dev/null
+++ b/external/cub/test/test_device_reduce.cu
@@ -0,0 +1,1339 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceReduce utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <limits>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/reduce.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_segmented_reduce.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+int                     g_ptx_version;                  // presumably the device PTX version; assigned outside this view
+int                     g_sm_count;                     // presumably the device SM count; assigned outside this view
+bool                    g_verbose           = false;    // verbosity flag (its consumers are outside this view)
+bool                    g_verbose_input     = false;    // when true, Initialize() prints the generated input
+int                     g_timing_iterations = 0;        // presumably the number of timed iterations requested; set outside this view
+int                     g_repeat            = 0;        // presumably extra repetitions of the test suite; set outside this view
+CachingDeviceAllocator  g_allocator(true);              // shared device allocator; NOTE(review): confirm the meaning of the `true` argument
+
+
+// Dispatch types (an enumerator is wrapped in Int2Type<...> to select a Dispatch overload)
+enum Backend
+{
+    CUB             = 0,    // CUB method
+    CUB_SEGMENTED   = 1,    // CUB segmented method
+    CUB_CDP         = 2,    // GPU-based (dynamic parallelism) dispatch to CUB method
+    THRUST          = 3,    // Thrust method
+};
+
+
+// Custom max functor (a user-defined reduction operator, handled by the generic ReductionOpT Dispatch overloads)
+struct CustomMax
+{
+    /// Generic (not boolean-specific) max operator: returns <tt>CUB_MAX(a, b)</tt>
+    template <typename OutputT>
+    __host__ __device__ __forceinline__ OutputT operator()(const OutputT &a, const OutputT &b)
+    {
+        return CUB_MAX(a, b);
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceReduce entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * CUB backend, generic-operator overload (custom-max): calls DeviceReduce::Reduce once per timing iteration; segment arguments are unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    ReductionOpT        reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Value types of the input and output iterators
+    typedef typename std::iterator_traits<InputIteratorT>::value_type   InputT;
+    typedef typename std::iterator_traits<OutputIteratorT>::value_type  OutputValueT;
+
+    // OutputT is the output iterator's value type unless that is void, in which case InputT is used
+    typedef typename If<(Equals<OutputValueT, void>::VALUE), InputT, OutputValueT>::Type OutputT;
+
+    // Max-identity (replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent)
+    OutputT identity = Traits<InputT>::Lowest();
+
+    // Issue the reduction once per iteration; the status of the last call is returned
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceReduce::Reduce(
+            d_temp_storage, temp_storage_bytes, d_in, d_out, num_items,
+            reduction_op, identity, stream, debug_synchronous);
+    }
+    return status;
+}
+
+/**
+ * CUB backend, cub::Sum overload: calls DeviceReduce::Sum once per timing iteration; segment arguments are unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::Sum            reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    cudaError_t status = cudaSuccess;
+
+    // Repeat the device-wide sum; only the status of the last call is reported
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        status = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+
+    return status;
+}
+
+/**
+ * CUB backend, cub::Min overload: calls DeviceReduce::Min once per timing iteration; segment arguments are unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::Min            reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    cudaError_t status = cudaSuccess;
+
+    // Repeat the device-wide min; only the status of the last call is reported
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        status = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+
+    return status;
+}
+
+/**
+ * CUB backend, cub::Max overload: calls DeviceReduce::Max once per timing iteration; segment arguments are unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::Max            reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    cudaError_t status = cudaSuccess;
+
+    // Repeat the device-wide max; only the status of the last call is reported
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        status = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+
+    return status;
+}
+
+/**
+ * CUB backend, cub::ArgMin overload: calls DeviceReduce::ArgMin once per timing iteration; segment arguments are unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::ArgMin         reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    cudaError_t status = cudaSuccess;
+
+    // Repeat the device-wide argmin; only the status of the last call is reported
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        status = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+
+    return status;
+}
+
+/**
+ * CUB backend, cub::ArgMax overload: calls DeviceReduce::ArgMax once per timing iteration; segment arguments are unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::ArgMax         reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    cudaError_t status = cudaSuccess;
+
+    // Repeat the device-wide argmax; only the status of the last call is reported
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        status = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+
+    return status;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceSegmentedReduce entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * CUB segmented backend, generic-operator overload (custom-max): calls DeviceSegmentedReduce::Reduce once per timing iteration; num_items is unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    ReductionOpT        reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Value types of the input and output iterators
+    typedef typename std::iterator_traits<InputIteratorT>::value_type   InputT;
+    typedef typename std::iterator_traits<OutputIteratorT>::value_type  OutputValueT;
+
+    // OutputT is the output iterator's value type unless that is void, in which case InputT is used
+    typedef typename If<(Equals<OutputValueT, void>::VALUE), InputT, OutputValueT>::Type OutputT;
+
+    // Max-identity (replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent)
+    OutputT identity = Traits<InputT>::Lowest();
+
+    // The offsets sequence and the same sequence advanced by one are passed as the two offset iterators
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::Reduce(
+            d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments,
+            d_segment_offsets, d_segment_offsets + 1,
+            reduction_op, identity, stream, debug_synchronous);
+    }
+    return status;
+}
+
+/**
+ * CUB segmented backend, cub::Sum overload: calls DeviceSegmentedReduce::Sum once per timing iteration; num_items is unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::Sum            reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // The offsets sequence and the same sequence advanced by one are passed as the two offset iterators
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::Sum(
+            d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments,
+            d_segment_offsets, d_segment_offsets + 1, stream, debug_synchronous);
+    }
+    return status;
+}
+
+/**
+ * CUB segmented backend, cub::Min overload: calls DeviceSegmentedReduce::Min once per timing iteration; num_items is unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::Min            reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // The offsets sequence and the same sequence advanced by one are passed as the two offset iterators
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::Min(
+            d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments,
+            d_segment_offsets, d_segment_offsets + 1, stream, debug_synchronous);
+    }
+    return status;
+}
+
+/**
+ * CUB segmented backend, cub::Max overload: calls DeviceSegmentedReduce::Max once per timing iteration; num_items is unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::Max            reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // The offsets sequence and the same sequence advanced by one are passed as the two offset iterators
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::Max(
+            d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments,
+            d_segment_offsets, d_segment_offsets + 1, stream, debug_synchronous);
+    }
+    return status;
+}
+
+/**
+ * CUB segmented backend, cub::ArgMin overload: calls DeviceSegmentedReduce::ArgMin once per timing iteration; num_items is unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::ArgMin         reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // The offsets sequence and the same sequence advanced by one are passed as the two offset iterators
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::ArgMin(
+            d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments,
+            d_segment_offsets, d_segment_offsets + 1, stream, debug_synchronous);
+    }
+    return status;
+}
+
+/**
+ * CUB segmented backend, cub::ArgMax overload: calls DeviceSegmentedReduce::ArgMax once per timing iteration; num_items is unused
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_SEGMENTED>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    cub::ArgMax         reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // The offsets sequence and the same sequence advanced by one are passed as the two offset iterators
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        status = DeviceSegmentedReduce::ArgMax(
+            d_temp_storage, temp_storage_bytes, d_in, d_out, max_segments,
+            d_segment_offsets, d_segment_offsets + 1, stream, debug_synchronous);
+    }
+    return status;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Thrust backend, generic-operator overload (min or max): reduces with thrust::reduce seeded by the first input element; always returns cudaSuccess
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    ReductionOpT         reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    typedef typename std::iterator_traits<InputIteratorT>::value_type   InputT;
+    typedef typename std::iterator_traits<OutputIteratorT>::value_type  OutputValueT;
+
+    // OutputT is the output iterator's value type unless that is void, in which case InputT is used
+    typedef typename If<(Equals<OutputValueT, void>::VALUE), InputT, OutputValueT>::Type OutputT;
+
+    // Size-query pass (null d_temp_storage): this path uses no caller-provided storage, so report a nominal byte
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+        return cudaSuccess;
+    }
+
+    // Seed the reduction with the first input element
+    OutputT init;
+    CubDebugExit(cudaMemcpy(&init, d_in + 0, sizeof(OutputT), cudaMemcpyDeviceToHost));
+    thrust::device_ptr<OutputT> d_in_wrapper(d_in);
+    OutputT retval;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+        retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items, init, reduction_op);
+
+    // Write back only when a reduction actually ran; with zero iterations retval was never assigned
+    if ((timing_timing_iterations > 0) && !Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
+        CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
+
+    return cudaSuccess;
+}
+
+/**
+ * Thrust backend, Sum overload: reduces with the two-argument thrust::reduce; always returns cudaSuccess
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    Sum                 reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    typedef typename std::iterator_traits<InputIteratorT>::value_type   InputT;
+    typedef typename std::iterator_traits<OutputIteratorT>::value_type  OutputValueT;
+
+    // OutputT is the output iterator's value type unless that is void, in which case InputT is used
+    typedef typename If<(Equals<OutputValueT, void>::VALUE), InputT, OutputValueT>::Type OutputT;
+
+    // Size-query pass (null d_temp_storage): this path uses no caller-provided storage, so report a nominal byte
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+        return cudaSuccess;
+    }
+
+    thrust::device_ptr<OutputT> d_in_wrapper(d_in);
+    OutputT retval;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+        retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items);
+
+    // Write back only when a reduction actually ran; with zero iterations retval was never assigned
+    if ((timing_timing_iterations > 0) && !Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
+        CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
+
+    return cudaSuccess;
+}
+
+
+//---------------------------------------------------------------------
+// CUDA nested-parallelism test kernel
+//---------------------------------------------------------------------
+
+/**
+ * Wrapper kernel: with CUB_CDP defined, runs the CUB Dispatch overload from device code and records its status and temp-storage size; otherwise records cudaErrorNotSupported
+ */
+template <
+    typename            InputIteratorT,
+    typename            OutputIteratorT,
+    typename            OffsetIteratorT,
+    typename            ReductionOpT>
+__global__ void CnpDispatchKernel(
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t              temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    ReductionOpT        reduction_op,
+    bool                debug_synchronous)
+{
+#ifdef CUB_CDP
+    *d_cdp_error = Dispatch(
+        Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, 0, debug_synchronous);
+    *d_temp_storage_bytes = temp_storage_bytes;
+#else
+    *d_cdp_error = cudaErrorNotSupported;
+#endif
+}
+
+
+/**
+ * CUB_CDP backend: launches CnpDispatchKernel with one thread, checks the launch, then copies back the temp-storage size and the device-side status
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB_CDP>       dispatch_to,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    int                 num_items,
+    int                 max_segments,
+    OffsetIteratorT     d_segment_offsets,
+    ReductionOpT        reduction_op,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch; a failed launch would otherwise leave stale values in the result buffers
+    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+        d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, debug_synchronous);
+    CubDebugExit(cudaPeekAtLastError());
+
+    // Copy out temp_storage_bytes, then the status recorded by the kernel
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Problem generation
+//---------------------------------------------------------------------
+
+/// Initialize problem: fills h_in[0..num_items) via InitValue and optionally prints it
+template <typename InputT>
+void Initialize(
+    GenMode         gen_mode,
+    InputT          *h_in,
+    int             num_items)
+{
+    // Each element is generated from the generation mode and its own index
+    for (int item = 0; item < num_items; ++item)
+        InitValue(gen_mode, h_in[item], item);
+
+    // Nothing more to do unless input echoing was requested
+    if (!g_verbose_input)
+        return;
+
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n\n");
+}
+
+
+/// Host reference solver (max/custom-max functor): folds each segment starting from Traits<InputT>::Lowest()
+template <typename ReductionOpT, typename InputT, typename _OutputT>
+struct Solution
+{
+    typedef _OutputT OutputT;
+
+    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
+        ReductionOpT reduction_op)
+    {
+        for (int segment = 0; segment < num_segments; ++segment)
+        {
+            OutputT running = Traits<InputT>::Lowest();     // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
+            for (int item = h_segment_offsets[segment]; item < h_segment_offsets[segment + 1]; ++item)
+                running = reduction_op(running, OutputT(h_in[item]));
+            h_reference[segment] = running;                 // one reference result per segment
+        }
+    }
+};
+
+/// Host reference solver specialised for cub::Min
+template <typename InputT, typename _OutputT>
+struct Solution<cub::Min, InputT, _OutputT>
+{
+    typedef _OutputT OutputT;
+    // Solve(): fold the items of segment s, [offsets[s], offsets[s + 1]), into h_reference[s]
+    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
+        cub::Min reduction_op)
+    {
+        for (int segment = 0; segment < num_segments; ++segment)
+        {
+            OutputT running = Traits<InputT>::Max();       // seed; replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
+            for (int item = h_segment_offsets[segment]; item < h_segment_offsets[segment + 1]; ++item)
+                running = reduction_op(running, OutputT(h_in[item]));
+            h_reference[segment] = running;
+        }
+    }
+};
+
+
+/// Host reference solver specialised for cub::Sum
+template <typename InputT, typename _OutputT>
+struct Solution<cub::Sum, InputT, _OutputT>
+{
+    typedef _OutputT OutputT;
+    // Solve(): fold the items of segment s, [offsets[s], offsets[s + 1]), into h_reference[s]
+    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
+        cub::Sum reduction_op)
+    {
+        for (int segment = 0; segment < num_segments; ++segment)
+        {
+            OutputT running;
+            InitValue(INTEGER_SEED, running, 0);            // seed generated for index 0 in INTEGER_SEED mode
+            for (int item = h_segment_offsets[segment]; item < h_segment_offsets[segment + 1]; ++item)
+                running = reduction_op(running, OutputT(h_in[item]));
+            h_reference[segment] = running;
+        }
+    }
+};
+
+/// Host reference solver specialised for cub::ArgMin; results are (segment-relative index, value) pairs
+template <typename InputValueT, typename OutputValueT>
+struct Solution<cub::ArgMin, InputValueT, OutputValueT>
+{
+    typedef KeyValuePair<int, OutputValueT> OutputT;
+    // Solve(): an empty segment keeps the seed pair (key 1, Traits<InputValueT>::Max())
+    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
+        cub::ArgMin reduction_op)
+    {
+        for (int segment = 0; segment < num_segments; ++segment)
+        {
+            OutputT best(1, Traits<InputValueT>::Max());    // seed; replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
+            for (int item = h_segment_offsets[segment]; item < h_segment_offsets[segment + 1]; ++item)
+            {
+                OutputT candidate(item - h_segment_offsets[segment], OutputValueT(h_in[item]));
+                best = reduction_op(best, candidate);
+            }
+            h_reference[segment] = best;
+        }
+    }
+};
+
+
+/// Host reference solver specialised for cub::ArgMax; results are (segment-relative index, value) pairs
+template <typename InputValueT, typename OutputValueT>
+struct Solution<cub::ArgMax, InputValueT, OutputValueT>
+{
+    typedef KeyValuePair<int, OutputValueT> OutputT;
+    // Solve(): an empty segment keeps the seed pair (key 1, Traits<InputValueT>::Lowest())
+    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
+    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
+        cub::ArgMax reduction_op)
+    {
+        for (int segment = 0; segment < num_segments; ++segment)
+        {
+            OutputT best(1, Traits<InputValueT>::Lowest()); // seed; replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
+            for (int item = h_segment_offsets[segment]; item < h_segment_offsets[segment + 1]; ++item)
+            {
+                OutputT candidate(item - h_segment_offsets[segment], OutputValueT(h_in[item]));
+                best = reduction_op(best, candidate);
+            }
+            h_reference[segment] = best;
+        }
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+/// Run one backend on a prepared device problem, compare the output with h_reference, and optionally time it
+template <
+    typename                BackendT,
+    typename                DeviceInputIteratorT,
+    typename                HostReferenceIteratorT,
+    typename                OffsetT,
+    typename                OffsetIteratorT,
+    typename                ReductionOpT>
+void Test(
+    BackendT                backend,                // Tag forwarded to Dispatch() to pick the backend overload
+    DeviceInputIteratorT    d_in,                   // Input items handed to the reduction
+    OffsetT                 num_items,              // Total number of input items
+    OffsetT                 num_segments,           // Number of segments, i.e. number of outputs compared
+    OffsetIteratorT         d_segment_offsets,      // Segment offsets forwarded unchanged to Dispatch()
+    ReductionOpT            reduction_op,           // Reduction functor under test
+    HostReferenceIteratorT  h_reference)            // Expected per-segment results
+{
+    // Element type of the input, and of the reference (which is also the device output element type)
+    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type     InputT;
+    typedef typename std::iterator_traits<HostReferenceIteratorT>::value_type   OutputT;
+
+    // Allocate the device output plus the two scratch words (temp-storage size, error code) passed to every Dispatch()
+    OutputT         *d_out = NULL;
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out,                 sizeof(OutputT) * num_segments));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Inquire temp device storage size (d_temp_storage is still NULL for this call)
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(backend, 1,
+        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+        d_in, d_out, num_items, num_segments, d_segment_offsets,
+        reduction_op, 0, true));
+
+    // Allocate temp device storage of the size reported above
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output so stale data cannot match the reference
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments));
+
+    // Run once with a discard iterator in place of d_out
+    DiscardOutputIterator<OffsetT> discard_itr;
+    CubDebugExit(Dispatch(backend, 1,
+        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+        d_in, discard_itr, num_items, num_segments, d_segment_offsets,
+        reduction_op, 0, true));
+
+    // Run warmup/correctness iteration (single iteration, stream 0, debug_synchronous = true)
+    CubDebugExit(Dispatch(backend, 1,
+        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+        d_in, d_out, num_items, num_segments, d_segment_offsets,
+        reduction_op, 0, true));
+
+    // Check for correctness (and display results, if specified); a non-zero result is reported as FAIL
+    int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose);
+    printf("\t%s", compare ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance: only measured when timing iterations were requested
+    if (g_timing_iterations > 0)
+    {
+        GpuTimer gpu_timer;
+        gpu_timer.Start();
+
+        CubDebugExit(Dispatch(backend, g_timing_iterations,
+            d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
+            d_in, d_out, num_items, num_segments, d_segment_offsets,
+            reduction_op, 0, false));
+
+        gpu_timer.Stop();
+        float elapsed_millis = gpu_timer.ElapsedMillis();
+
+        // Display performance (bandwidth counts input bytes only, hence "logical")
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth = giga_rate * sizeof(InputT);
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
+    }
+
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts (performed after the device buffers have been released)
+    AssertEquals(0, compare);
+}
+
+
+/// Build the host reference for one (backend, functor) combination, then check the device result against it
+template <
+    Backend                 BACKEND,
+    typename                OutputValueT,
+    typename                HostInputIteratorT,
+    typename                DeviceInputIteratorT,
+    typename                OffsetT,
+    typename                OffsetIteratorT,
+    typename                ReductionOpT>
+void SolveAndTest(
+    HostInputIteratorT      h_in,
+    DeviceInputIteratorT    d_in,
+    OffsetT                 num_items,
+    OffsetT                 num_segments,
+    OffsetIteratorT         h_segment_offsets,
+    OffsetIteratorT         d_segment_offsets,
+    ReductionOpT            reduction_op)
+{
+    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type     ElementT;
+    typedef Solution<ReductionOpT, ElementT, OutputValueT>                      ReferenceSolverT;
+    typedef typename ReferenceSolverT::OutputT                                  ResultT;
+
+    const char *backend_name = (BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == THRUST) ? "Thrust" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB";
+    printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n",
+        backend_name, typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments);
+    fflush(stdout);
+
+    // Host-side expected results, one per segment
+    ResultT *expected = new ResultT[num_segments];
+    ReferenceSolverT::Solve(h_in, expected, num_segments, h_segment_offsets, reduction_op);
+
+    // Device-side run and comparison
+    Test(Int2Type<BACKEND>(), d_in, num_items, num_segments, d_segment_offsets, reduction_op, expected);
+
+    // Release the reference buffer
+    delete[] expected;
+}
+
+
+/// Generate one (InputT -> OutputT) problem of the given size on host and device, then solve and test it
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        OffsetT,
+    typename        ReductionOpT>
+void TestProblem(
+    OffsetT         num_items,
+    OffsetT         num_segments,
+    GenMode         gen_mode,
+    ReductionOpT    reduction_op)
+{
+    printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode);
+    fflush(stdout);     // flush once so the banner is visible before the (possibly slow) generation step
+
+    // Initialize value data
+    InputT* h_in = new InputT[num_items];
+    Initialize(gen_mode, h_in, num_items);
+
+    // Initialize segment data (num_segments + 1 offsets: one begin per segment plus the final end)
+    OffsetT *h_segment_offsets = new OffsetT[num_segments + 1];
+    InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
+
+    // Initialize device data: mirror both host arrays on the device
+    OffsetT *d_segment_offsets      = NULL;
+    InputT  *d_in                   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in,              sizeof(InputT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
+    CubDebugExit(cudaMemcpy(d_in,               h_in,                   sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_segment_offsets,  h_segment_offsets,      sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
+
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, reduction_op);
+
+    delete[] h_segment_offsets;     // new[] never yields NULL here, so no guard is needed
+    if (d_segment_offsets)  CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+    delete[] h_in;
+    if (d_in)               CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/// Run SolveAndTest on the same problem once per reduction functor, in the fixed order listed below
+template <
+    Backend             BACKEND,
+    typename            OutputT,
+    typename            HostInputIteratorT,
+    typename            DeviceInputIteratorT,
+    typename            OffsetT,
+    typename            OffsetIteratorT>
+void TestByOp(
+    HostInputIteratorT      h_in,
+    DeviceInputIteratorT    d_in,
+    OffsetT                 num_items,
+    OffsetT                 num_segments,
+    OffsetIteratorT         h_segment_offsets,
+    OffsetIteratorT         d_segment_offsets)
+{
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, CustomMax());   // generic Solution
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Sum());         // Solution<cub::Sum>
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Min());         // Solution<cub::Min>
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMin());      // Solution<cub::ArgMin>
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Max());         // generic Solution
+    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMax());      // Solution<cub::ArgMax>
+}
+
+
+/// Generate one input array and run every backend (single-segment, then segmented) over it
+template <
+    typename    InputT,
+    typename    OutputT,
+    typename    OffsetT>
+void TestByBackend(
+    OffsetT     num_items,
+    OffsetT     max_segments,
+    GenMode     gen_mode)
+{
+    // Initialize host data
+    printf("\n\nInitializing %d %s -> %s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
+
+    const OffsetT offsets_capacity = ((max_segments > 1) ? max_segments : OffsetT(1)) + 1;  // single-segment tests use offsets [0] and [1] even when max_segments < 1
+    InputT  *h_in               = new InputT[num_items];
+    OffsetT *h_segment_offsets  = new OffsetT[offsets_capacity];
+    Initialize(gen_mode, h_in, num_items);
+
+    // Initialize device data
+    InputT  *d_in               = NULL;
+    OffsetT *d_segment_offsets  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * offsets_capacity));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
+
+    //
+    // Test single-segment implementations
+    //
+
+    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);
+
+    // Page-aligned-input tests
+    TestByOp<CUB, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL);                 // Host-dispatch
+#ifdef CUB_CDP
+    TestByOp<CUB_CDP, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL);             // Device-dispatch
+#endif
+
+    // Non-page-aligned-input tests (skip the first element so the input pointer is offset by one item)
+    if (num_items > 1)
+    {
+        InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input);
+        TestByOp<CUB, OutputT>(h_in + 1, d_in + 1, num_items - 1, 1, h_segment_offsets, (OffsetT*) NULL);
+    }
+
+    //
+    // Test segmented implementation
+    //
+
+    // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
+    int max_items_per_segment = 128000;
+
+    for (int num_segments = (num_items + max_items_per_segment - 1) / max_items_per_segment;
+        num_segments < max_segments;
+        num_segments = (num_segments * 32) + 1)
+    {
+        // Test with segment pointer
+        InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
+        CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
+        TestByOp<CUB_SEGMENTED, OutputT>(
+            h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets);
+
+        // Test with segment iterator (same offsets, wrapped in a cast-to-OffsetT transform iterator)
+        typedef CastOp<OffsetT> IdentityOpT;
+        IdentityOpT identity_op;
+        TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> h_segment_offsets_itr(
+            h_segment_offsets,
+            identity_op);
+        TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> d_segment_offsets_itr(
+            d_segment_offsets,
+            identity_op);
+
+        TestByOp<CUB_SEGMENTED, OutputT>(
+            h_in, d_in, num_items, num_segments, h_segment_offsets_itr, d_segment_offsets_itr);
+    }
+
+    delete[] h_in;
+    delete[] h_segment_offsets;
+    if (d_in)               CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_segment_offsets)  CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
+}
+
+
+/// Exercise one problem size with every input-generation mode, then with a constant-iterator input
+template <
+    typename InputT,
+    typename OutputT,
+    typename OffsetT>
+void TestByGenMode(
+    OffsetT num_items,
+    OffsetT max_segments)
+{
+    //
+    // Pointer inputs: run all backends once per input-generation mode
+    //
+
+    const GenMode pointer_modes[] = {UNIFORM, INTEGER_SEED, RANDOM};
+    for (int m = 0; m < 3; ++m)
+        TestByBackend<InputT, OutputT>(num_items, max_segments, pointer_modes[m]);
+
+    //
+    // Iterator inputs: a constant-iterator reduced with SUM over a single segment
+    //
+
+    InputT constant_value;
+    InitValue(UNIFORM, constant_value, 0);
+    ConstantInputIterator<InputT, OffsetT> constant_itr(constant_value);
+
+    OffsetT *single_segment_offsets = new OffsetT[2];
+    InitializeSegments(num_items, 1, single_segment_offsets, g_verbose_input);
+
+    SolveAndTest<CUB, OutputT>(constant_itr, constant_itr, num_items, 1, single_segment_offsets, (OffsetT*) NULL, Sum());
+#ifdef CUB_CDP
+    SolveAndTest<CUB_CDP, OutputT>(constant_itr, constant_itr, num_items, 1, single_segment_offsets, (OffsetT*) NULL, Sum());
+#endif
+
+    delete[] single_segment_offsets;
+}
+
+
+/// Functor handed to the policy chain: sweeps problem sizes for one (InputT, OutputT) pairing
+template <
+    typename InputT,
+    typename OutputT,
+    typename OffsetT>
+struct TestBySize
+{
+    OffsetT max_items;
+    OffsetT max_segments;
+
+    TestBySize(OffsetT max_items, OffsetT max_segments) :
+        max_items(max_items),
+        max_segments(max_segments)
+    {}
+
+    template <typename ActivePolicyT>
+    cudaError_t Invoke()
+    {
+        //
+        // Black-box coverage across all backends
+        //
+
+        // Degenerate, unit and maximal sizes
+        TestByGenMode<InputT, OutputT>(0,           max_segments);
+        TestByGenMode<InputT, OutputT>(1,           max_segments);
+        TestByGenMode<InputT, OutputT>(max_items,   max_segments);
+
+        // Random sizes drawn from a log-distribution [8, max_items-ish)
+        const int       random_trials = 8;
+        const double    log2_max_items = log(double(max_items)) / log(double(2.0));
+        for (int trial = 0; trial < random_trials; ++trial)
+        {
+            OffsetT random_size = (OffsetT) pow(2.0, RandomValue(log2_max_items - 3.0) + 3.0);
+            TestByGenMode<InputT, OutputT>(random_size, max_segments);
+        }
+
+        //
+        // White-box coverage of single-segment problems near tile boundaries
+        //
+
+        // Four tiles exactly, then one item over and one item under
+        OffsetT items_per_tile = ActivePolicyT::ReducePolicy::BLOCK_THREADS * ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD;
+        TestProblem<CUB, InputT, OutputT>(items_per_tile * 4,  1,      RANDOM, Sum());
+        TestProblem<CUB, InputT, OutputT>(items_per_tile * 4 + 1, 1,   RANDOM, Sum());
+        TestProblem<CUB, InputT, OutputT>(items_per_tile * 4 - 1, 1,   RANDOM, Sum());
+
+        // tile size x 32 x SM count items: exactly, one over and one under
+        OffsetT blocks_per_sm = 32;
+        OffsetT saturating_items = items_per_tile * blocks_per_sm * g_sm_count;
+        TestProblem<CUB, InputT, OutputT>(saturating_items,  1,      RANDOM, Sum());
+        TestProblem<CUB, InputT, OutputT>(saturating_items + 1, 1,   RANDOM, Sum());
+        TestProblem<CUB, InputT, OutputT>(saturating_items - 1, 1,   RANDOM, Sum());
+
+        return cudaSuccess;
+    }
+};
+
+
+/// Run the size sweep for one (InputT, OutputT) pairing under the policy chosen for g_ptx_version
+template <
+    typename    InputT,
+    typename    OutputT,
+    typename    OffsetT>
+void TestType(
+    OffsetT     max_items,
+    OffsetT     max_segments)
+{
+    typedef typename DeviceReducePolicy<OutputT, OffsetT, cub::Sum>::MaxPolicy MaxPolicyT;
+
+    TestBySize<InputT, OutputT, OffsetT> dispatch(max_items, max_segments);
+
+    // Invoke() reports a cudaError_t; check it like every other CUDA-returning call in this file
+    CubDebugExit(MaxPolicyT::Invoke(g_ptx_version, dispatch));
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+
+/**
+ * Main: parse the command line, initialize the device, then run the test suite selected at compile time
+ */
+int main(int argc, char** argv)
+{
+    typedef int OffsetT;
+
+    OffsetT max_items       = 27000000;
+    OffsetT max_segments    = 34000;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_verbose_input = args.CheckCmdLineFlag("v2");
+    args.GetCmdLineArgument("n", max_items);
+    args.GetCmdLineArgument("s", max_segments);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--s=<num segments>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v] [--v2] "
+            "[--cdp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get ptx version
+    CubDebugExit(PtxVersion(g_ptx_version));
+
+    // Get SM count
+    g_sm_count = args.deviceProp.multiProcessorCount;
+
+    // Test suite selection: QUICKER_TEST (basic), QUICK_TEST (CUB vs. Thrust comparison), otherwise thorough
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic test
+
+
+
+    TestProblem<CUB, int, int>(     max_items, 1, RANDOM, Sum());
+
+    TestProblem<CUB, char, int>(    max_items, 1, RANDOM, Sum());
+
+    TestProblem<CUB, int, int>(     max_items, 1, RANDOM, ArgMax());
+
+    TestProblem<CUB, float, float>( max_items, 1, RANDOM, Sum());
+
+    TestProblem<CUB_SEGMENTED, int, int>(max_items, max_segments, RANDOM, Sum());
+
+
+#elif defined(QUICK_TEST)
+
+    // Compile/run quick comparison tests
+
+    TestProblem<CUB, char, char>(         max_items * 4, 1, UNIFORM, Sum());
+    TestProblem<THRUST, char, char>(      max_items * 4, 1, UNIFORM, Sum());
+
+    printf("\n----------------------------\n");
+    TestProblem<CUB, short, short>(        max_items * 2, 1, UNIFORM, Sum());
+    TestProblem<THRUST, short, short>(     max_items * 2, 1, UNIFORM, Sum());
+
+    printf("\n----------------------------\n");
+    TestProblem<CUB, int, int>(          max_items,     1, UNIFORM, Sum());
+    TestProblem<THRUST, int, int>(       max_items,     1, UNIFORM, Sum());
+
+    printf("\n----------------------------\n");
+    TestProblem<CUB, long long, long long>(    max_items / 2, 1, UNIFORM, Sum());
+    TestProblem<THRUST, long long, long long>( max_items / 2, 1, UNIFORM, Sum());
+
+    printf("\n----------------------------\n");
+    TestProblem<CUB, TestFoo, TestFoo>(      max_items / 4, 1, UNIFORM, Max());
+    TestProblem<THRUST, TestFoo, TestFoo>(   max_items / 4, 1, UNIFORM, Max());
+
+#else
+
+    // Compile/run thorough tests (the whole suite runs g_repeat + 1 times)
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test different input types
+        TestType<char, char>(max_items, max_segments);
+
+        TestType<unsigned char, unsigned char>(max_items, max_segments);
+
+        TestType<char, int>(max_items, max_segments);
+
+        TestType<short, short>(max_items, max_segments);
+        TestType<int, int>(max_items, max_segments);
+        TestType<long, long>(max_items, max_segments);
+        TestType<long long, long long>(max_items, max_segments);
+
+        TestType<uchar2, uchar2>(max_items, max_segments);
+        TestType<uint2, uint2>(max_items, max_segments);
+        TestType<ulonglong2, ulonglong2>(max_items, max_segments);
+        TestType<ulonglong4, ulonglong4>(max_items, max_segments);
+
+        TestType<TestFoo, TestFoo>(max_items, max_segments);
+        TestType<TestBar, TestBar>(max_items, max_segments);
+
+    }
+
+#endif
+
+
+    printf("\n");
+    return 0;
+}
+
+
+
diff --git a/external/cub/test/test_device_reduce_by_key.cu b/external/cub/test/test_device_reduce_by_key.cu
new file mode 100644
index 00000000000..4d9c4726949
--- /dev/null
+++ b/external/cub/test/test_device_reduce_by_key.cu
@@ -0,0 +1,853 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceReduce::ReduceByKey utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/reduce.h>
+#include <thrust/iterator/constant_iterator.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_run_length_encode.cuh>
+#include <cub/thread/thread_operators.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false;   // off by default; NOTE(review): where it is set/read lies outside this excerpt
+int                     g_timing_iterations = 0;       // zero by default; presumably the timing-loop count -- confirm against main()
+int                     g_repeat            = 0;       // zero by default; presumably extra whole-suite repetitions -- confirm against main()
+CachingDeviceAllocator  g_allocator(true);             // NOTE(review): confirm the meaning of the `true` constructor argument
+
+// Dispatch types: each enumerator is wrapped in Int2Type<> to select a Dispatch() overload
+enum Backend
+{
+    CUB,        // CUB method
+    THRUST,     // Thrust method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to the CUB DeviceReduce::ReduceByKey entrypoint, repeated up to timing_timing_iterations times
+ */
+template <
+    typename                    KeyInputIteratorT,
+    typename                    KeyOutputIteratorT,
+    typename                    ValueInputIteratorT,
+    typename                    ValueOutputIteratorT,
+    typename                    NumRunsIteratorT,
+    typename                    EqualityOpT,
+    typename                    ReductionOpT,
+    typename                    OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               dispatch_to,                // overload-selection tag only
+    int                         timing_timing_iterations,   // maximum number of back-to-back invocations
+    size_t                      *d_temp_storage_bytes,      // unused by this backend
+    cudaError_t                 *d_cdp_error,               // unused by this backend
+
+    void                        *d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueInputIteratorT         d_values_in,
+    ValueOutputIteratorT        d_values_out,
+    NumRunsIteratorT            d_num_runs,
+    EqualityOpT                 equality_op,                // accepted for signature parity; not forwarded
+    ReductionOpT                reduction_op,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; (i < timing_timing_iterations) && (error == cudaSuccess); ++i)   // stop at the first failure so it is not overwritten
+    {
+        error = DeviceReduce::ReduceByKey(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_keys_in,
+            d_keys_out,
+            d_values_in,
+            d_values_out,
+            d_num_runs,
+            reduction_op,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+    return error;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to thrust::reduce_by_key (NOTE(review): equality_op and reduction_op are not forwarded to Thrust)
+ */
+template <
+    typename                    KeyInputIteratorT,
+    typename                    KeyOutputIteratorT,
+    typename                    ValueInputIteratorT,
+    typename                    ValueOutputIteratorT,
+    typename                    NumRunsIteratorT,
+    typename                    EqualityOpT,
+    typename                    ReductionOpT,
+    typename                    OffsetT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>            dispatch_to,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void                        *d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueInputIteratorT         d_values_in,
+    ValueOutputIteratorT        d_values_out,
+    NumRunsIteratorT            d_num_runs,
+    EqualityOpT                 equality_op,
+    ReductionOpT                reduction_op,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Key type read from the input iterator
+    typedef typename std::iterator_traits<KeyInputIteratorT>::value_type KeyInT;
+
+    // Key type written: the output iterator's value type, falling back to KeyInT when that is void
+    typedef typename If<
+        (Equals<typename std::iterator_traits<KeyOutputIteratorT>::value_type, void>::VALUE),
+        KeyInT, typename std::iterator_traits<KeyOutputIteratorT>::value_type>::Type KeyOutT;
+
+    // Value type read from the input iterator
+    typedef typename std::iterator_traits<ValueInputIteratorT>::value_type ValueInT;
+
+    // Value type written: the output iterator's value type, falling back to ValueInT when that is void
+    typedef typename If<
+        (Equals<typename std::iterator_traits<ValueOutputIteratorT>::value_type, void>::VALUE),
+        ValueInT, typename std::iterator_traits<ValueOutputIteratorT>::value_type>::Type ValueOutT;
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;     // size query: report one byte and do no work
+        return cudaSuccess;
+    }
+
+    // Wrap the raw device pointers for Thrust
+    thrust::device_ptr<KeyInT> keys_begin(d_keys_in);
+    thrust::device_ptr<KeyOutT> keys_out(d_keys_out);
+
+    thrust::device_ptr<ValueInT> values_begin(d_values_in);
+    thrust::device_ptr<ValueOutT> values_out(d_values_out);
+
+    thrust::pair<thrust::device_ptr<KeyOutT>, thrust::device_ptr<ValueOutT> > out_ends;
+
+    for (int iteration = 0; iteration < timing_timing_iterations; ++iteration)
+    {
+        out_ends = thrust::reduce_by_key(
+            keys_begin,
+            keys_begin + num_items,
+            values_begin,
+            keys_out,
+            values_out);
+    }
+
+    // The number of runs is the distance the key output advanced; publish it to d_num_runs
+    OffsetT num_segments = OffsetT(out_ends.first - keys_out);
+    CubDebugExit(cudaMemcpy(d_num_runs, &num_segments, sizeof(OffsetT), cudaMemcpyHostToDevice));
+
+    return cudaSuccess;
+}
+
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel that invokes the CUB-backend Dispatch() from device code, reporting its status through d_cdp_error and the temp storage size through d_temp_storage_bytes
+ */
+template <
+    typename                    KeyInputIteratorT,
+    typename                    KeyOutputIteratorT,
+    typename                    ValueInputIteratorT,
+    typename                    ValueOutputIteratorT,
+    typename                    NumRunsIteratorT,
+    typename                    EqualityOpT,
+    typename                    ReductionOpT,
+    typename                    OffsetT>
+__global__ void CnpDispatchKernel(
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void                        *d_temp_storage,
+    size_t                      temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueInputIteratorT         d_values_in,
+    ValueOutputIteratorT        d_values_out,
+    NumRunsIteratorT            d_num_runs,
+    EqualityOpT                 equality_op,
+    ReductionOpT                reduction_op,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Without CUB_CDP the nested dispatch is compiled out and only an error code is reported
+#ifndef CUB_CDP
+    *d_cdp_error = cudaErrorNotSupported;
+#else
+    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
+    // temp_storage_bytes is a by-value kernel parameter, so its value is handed back through device memory
+    *d_temp_storage_bytes = temp_storage_bytes;
+#endif
+}
+
+
+/**
+ * Dispatch to CDP kernel: launches CnpDispatchKernel with a single thread, verifies the launch, then copies the reported temp storage size and status back to the host
+ */
+template <
+    typename                    KeyInputIteratorT,
+    typename                    KeyOutputIteratorT,
+    typename                    ValueInputIteratorT,
+    typename                    ValueOutputIteratorT,
+    typename                    NumRunsIteratorT,
+    typename                    EqualityOpT,
+    typename                    ReductionOpT,
+    typename                    OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CDP>               dispatch_to,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void                        *d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    KeyInputIteratorT           d_keys_in,
+    KeyOutputIteratorT          d_keys_out,
+    ValueInputIteratorT         d_values_in,
+    ValueOutputIteratorT        d_values_out,
+    NumRunsIteratorT            d_num_runs,
+    EqualityOpT                 equality_op,
+    ReductionOpT                reduction_op,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch (the kernel is always handed stream 0; `stream` is not forwarded)
+    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
+    CubDebugExit(cudaGetLastError());   // a failed launch would leave d_temp_storage_bytes / d_cdp_error unwritten
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem: fills h_in with consecutive runs, each run initialized via InitValue from a run index that increments per run.  max_segment < 0 gives one run over all items, max_segment < 2 gives runs of length one, otherwise run lengths are random in [1, max_segment]
+ */
+template <typename T>
+void Initialize(
+    int         entropy_reduction,
+    T           *h_in,
+    int         num_items,
+    int         max_segment)
+{
+    unsigned int max_int = (unsigned int) -1;
+
+    int key = 0;
+    int i = 0;
+    while (i < num_items)
+    {
+        // Select number of repeating occurrences
+        unsigned int random_bits = 0;   // drawn unsigned: a signed draw can be negative, which collapsed the run to length 1 and capped lengths near max_segment / 2
+        int repeat;
+
+        if (max_segment < 0)
+        {
+            repeat = num_items;
+        }
+        else if (max_segment < 2)
+        {
+            repeat = 1;
+        }
+        else
+        {
+            RandomBits(random_bits, entropy_reduction);
+            repeat = (int) ((double(random_bits) * double(max_segment)) / double(max_int));
+            repeat = CUB_MAX(1, repeat);
+        }
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))
+        {
+            InitValue(INTEGER_SEED, h_in[j], key);
+            j++;
+        }
+
+        i = j;
+        key++;
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve problem on the host: collapses each run of consecutive equal keys into one reference key and the reduction of that run's values.  Returns total number of segments identified (0 for an empty input)
+ */
+template <
+    typename        KeyInputIteratorT,
+    typename        ValueInputIteratorT,
+    typename        KeyT,
+    typename        ValueT,
+    typename        EqualityOpT,
+    typename        ReductionOpT>
+int Solve(
+    KeyInputIteratorT       h_keys_in,
+    KeyT                    *h_keys_reference,
+    ValueInputIteratorT     h_values_in,
+    ValueT                  *h_values_reference,
+    EqualityOpT             equality_op,
+    ReductionOpT            reduction_op,
+    int                     num_items)
+{
+    // Empty input has no segments (and no first item to read)
+    if (num_items <= 0) return 0;
+    KeyT previous        = h_keys_in[0];     // First item
+    ValueT aggregate     = h_values_in[0];
+    int num_segments    = 0;
+    // Subsequent items
+    for (int i = 1; i < num_items; ++i)
+    {
+        if (!equality_op(previous, h_keys_in[i]))
+        {
+            h_keys_reference[num_segments] = previous;
+            h_values_reference[num_segments] = aggregate;
+            num_segments++;
+            aggregate = h_values_in[i];
+        }
+        else
+        {
+            aggregate = reduction_op(aggregate, h_values_in[i]);
+        }
+        previous = h_keys_in[i];
+    }
+
+    h_keys_reference[num_segments] = previous;
+    h_values_reference[num_segments] = aggregate;
+    num_segments++;
+
+    return num_segments;
+}
+
+
+
+/**
+ * Test reduce-by-key for a given problem input: runs one correctness pass that is compared against the host reference, then a timed pass of g_timing_iterations iterations
+ */
+template <
+    Backend             BACKEND,
+    typename            DeviceKeyInputIteratorT,
+    typename            DeviceValueInputIteratorT,
+    typename            KeyT,
+    typename            ValueT,
+    typename            EqualityOpT,
+    typename            ReductionOpT>
+void Test(
+    DeviceKeyInputIteratorT     d_keys_in,
+    DeviceValueInputIteratorT   d_values_in,
+    KeyT*                       h_keys_reference,
+    ValueT*                     h_values_reference,
+    EqualityOpT                 equality_op,
+    ReductionOpT                reduction_op,
+    int                         num_segments,
+    int                         num_items)
+{
+    // Allocate device output arrays and number of segments
+    KeyT*   d_keys_out             = NULL;
+    ValueT* d_values_out           = NULL;
+    int*    d_num_runs         = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));
+
+    // Allocate CDP device arrays
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage (first call passes a NULL d_temp_storage and only fills in temp_storage_bytes)
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output arrays
+    CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items));
+    CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items));
+    CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));
+
+    // Check for correctness (and display results, if specified)
+    int compare1 = CompareDeviceResults(h_keys_reference, d_keys_out, num_segments, true, g_verbose);
+    printf("\t Keys %s ", compare1 ? "FAIL" : "PASS");
+
+    int compare2 = CompareDeviceResults(h_values_reference, d_values_out, num_segments, true, g_verbose);
+    printf("\t Values %s ", compare2 ? "FAIL" : "PASS");
+
+    int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose);
+    printf("\t Count %s ", compare3 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, false));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance
+    if (g_timing_iterations > 0)
+    {
+        float   avg_millis  = elapsed_millis / g_timing_iterations;
+        float   giga_rate   = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        size_t  bytes_moved = (size_t(num_items) + size_t(num_segments)) * (sizeof(KeyT) + sizeof(ValueT));   // size_t: an int total overflows for large inputs / wide types
+        float   giga_bandwidth  = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out));
+    if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out));
+    if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts (deferred until after cleanup so device buffers are released first)
+    AssertEquals(0, compare1 | compare2 | compare3);
+}
+
+
+/**
+ * Test reduce-by-key on pointer type: builds host keys/values and the host reference, copies the inputs into device arrays, then runs Test<BACKEND> on them
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestPointer(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment,
+    ReductionOpT    reduction_op)
+{
+    // Allocate host arrays
+    KeyT* h_keys_in        = new KeyT[num_items];
+    KeyT* h_keys_reference = new KeyT[num_items];
+
+    ValueT* h_values_in        = new ValueT[num_items];
+    ValueT* h_values_reference = new ValueT[num_items];
+
+    for (int i = 0; i < num_items; ++i)                 // every input value gets the same InitValue arguments
+        InitValue(INTEGER_SEED, h_values_in[i], 1);
+
+    // Initialize problem and solution
+    Equality equality_op;
+    Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
+    int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
+
+    printf("\nPointer %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        (Equals<ReductionOpT, Sum>::VALUE) ? "Sum" : "Max",      // any operator other than Sum is labelled "Max"
+        num_items, num_segments, float(num_items) / num_segments,
+        typeid(KeyT).name(), typeid(ValueT).name(),
+        max_segment, entropy_reduction);
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    KeyT     *d_keys_in = NULL;
+    ValueT   *d_values_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(ValueT) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<BACKEND>(d_keys_in, d_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
+
+    // Cleanup
+    if (h_keys_in) delete[] h_keys_in;
+    if (h_values_in) delete[] h_values_in;
+    if (h_keys_reference) delete[] h_keys_reference;
+    if (h_values_reference) delete[] h_values_reference;
+    if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
+    if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in));
+}
+
+
+/**
+ * Test on iterator type: like the pointer test, but the values come from a ConstantInputIterator instead of a device array (keys are still copied to device memory)
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestIterator(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment,
+    ReductionOpT    reduction_op)
+{
+    // Allocate host arrays
+    KeyT* h_keys_in        = new KeyT[num_items];
+    KeyT* h_keys_reference = new KeyT[num_items];
+
+    ValueT one_val;
+    InitValue(INTEGER_SEED, one_val, 1);
+    ConstantInputIterator<ValueT, int> h_values_in(one_val);     // used both for the host reference and as the value input handed to Test()
+    ValueT* h_values_reference = new ValueT[num_items];
+
+    // Initialize problem and solution
+    Equality equality_op;
+    Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
+    int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
+
+    printf("\nIterator %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        (Equals<ReductionOpT, Sum>::VALUE) ? "Sum" : "Max",      // any operator other than Sum is labelled "Max"
+        num_items, num_segments, float(num_items) / num_segments,
+        typeid(KeyT).name(), typeid(ValueT).name(),
+        max_segment, entropy_reduction);
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    KeyT     *d_keys_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<BACKEND>(d_keys_in, h_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
+
+    // Cleanup
+    if (h_keys_in) delete[] h_keys_in;
+    if (h_keys_reference) delete[] h_keys_reference;
+    if (h_values_reference) delete[] h_values_reference;
+    if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
+}
+
+
+/**
+ * Test different gen modes: always runs with 0 entropy-reduction rounds, and additionally with 2 and 7 rounds when max_segment > 1
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void Test(
+    int             num_items,
+    ReductionOpT    reduction_op,
+    int             max_segment)
+{
+    // 0 key-bit entropy reduction rounds
+    TestPointer<BACKEND, KeyT, ValueT>(num_items, 0, max_segment, reduction_op);
+
+    if (max_segment > 1)    // Initialize() only draws random run lengths when max_segment >= 2, so the extra rounds matter only then
+    {
+        // 2 key-bit entropy reduction rounds
+        TestPointer<BACKEND, KeyT, ValueT>(num_items, 2, max_segment, reduction_op);
+
+        // 7 key-bit entropy reduction rounds
+        TestPointer<BACKEND, KeyT, ValueT>(num_items, 7, max_segment, reduction_op);
+    }
+}
+
+
+/**
+ * Test different avg segment lengths modes: max_segment -1 and 1, then 3, 33, 363, ... (multiplied by 11 each step)
+ */
+template <
+    Backend         BACKEND,
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void Test(
+    int             num_items,
+    ReductionOpT    reduction_op)
+{
+    Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, -1);    // negative max_segment: Initialize() makes one run span all items
+    Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, 1);     // max_segment 1: Initialize() makes every run a single item
+
+    // Evaluate different max-segment lengths (while below CUB_MIN of num_items and the largest unsigned short)
+    for (int max_segment = 3; max_segment < CUB_MIN(num_items, (unsigned short) -1); max_segment *= 11)
+    {
+        Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, max_segment);
+    }
+}
+
+
+
+/**
+ * Test different dispatch: always the CUB backend, plus the CDP backend when CUB_CDP is defined (the THRUST backend is not exercised from here)
+ */
+template <
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestDispatch(
+    int             num_items,
+    ReductionOpT    reduction_op)
+{
+    Test<CUB, KeyT, ValueT>(num_items, reduction_op);
+#ifdef CUB_CDP
+    Test<CDP, KeyT, ValueT>(num_items, reduction_op);
+#endif
+}
+
+
+/**
+ * Test different input sizes: a non-negative num_items is tested as given, a negative one selects the built-in size sweep
+ */
+template <
+    typename        KeyT,
+    typename        ValueT,
+    typename        ReductionOpT>
+void TestSize(
+    int             num_items,
+    ReductionOpT    reduction_op)
+{
+    // Explicit problem size requested
+    if (num_items >= 0)
+    {
+        TestDispatch<KeyT, ValueT>(num_items, reduction_op);
+        return;
+    }
+
+    // No size requested: sweep the default sizes in ascending order
+    const int default_sizes[] = {1, 100, 10000, 1000000};
+    for (int s = 0; s < 4; ++s)
+        TestDispatch<KeyT, ValueT>(default_sizes[s], reduction_op);
+
+}
+
+/** Test both reduction operators (cub::Sum, then cub::Max) for one key/value type pair */
+template <
+    typename        KeyT,
+    typename        ValueT>
+void TestOp(
+    int             num_items)
+{
+    TestSize<KeyT, ValueT>(num_items, cub::Sum());
+    TestSize<KeyT, ValueT>(num_items, cub::Max());
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parses the command line, initializes the device, then runs the test set selected at compile time (QUICKER_TEST, QUICK_TEST, or the thorough default)
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;
+    int entropy_reduction   = 0;
+    int maxseg              = 1000;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("maxseg", maxseg);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage (every option is bracketed and space-separated)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--entropy=<segment length bit entropy reduction rounds>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v] "
+            "[--cdp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    printf("\n");
+
+    // Get ptx version
+    int ptx_version;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic CUB test
+    if (num_items < 0) num_items = 32000000;
+
+    TestPointer<CUB, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
+    TestPointer<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+    TestIterator<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+
+#elif defined(QUICK_TEST)
+
+    // Compile/run quick tests
+    if (num_items < 0) num_items = 32000000;
+
+    printf("---- RLE int ---- \n");
+    TestIterator<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+
+    printf("---- RLE long long ---- \n");
+    TestIterator<CUB, long long, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+
+    printf("---- int ---- \n");
+    TestPointer<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+    TestPointer<THRUST, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
+
+    printf("---- float ---- \n");
+    TestPointer<CUB, int, float>(num_items, entropy_reduction, maxseg, cub::Sum());
+    TestPointer<THRUST, int, float>(num_items, entropy_reduction, maxseg, cub::Sum());
+
+    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+    {
+        printf("---- double ---- \n");
+        TestPointer<CUB, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
+        TestPointer<THRUST, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
+    }
+
+#else
+
+    // Compile/run thorough tests (the --maxseg and --entropy options are not used on this path)
+    for (int i = 0; i <= g_repeat; ++i)             // inclusive bound: the suite runs g_repeat + 1 times
+    {
+
+        // Test different input types
+        TestOp<int, char>(num_items);
+        TestOp<int, short>(num_items);
+        TestOp<int, int>(num_items);
+        TestOp<int, long>(num_items);
+        TestOp<int, long long>(num_items);
+        TestOp<int, float>(num_items);
+        if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
+            TestOp<int, double>(num_items);
+
+        TestOp<int, uchar2>(num_items);
+        TestOp<int, uint2>(num_items);
+        TestOp<int, uint3>(num_items);
+        TestOp<int, uint4>(num_items);
+        TestOp<int, ulonglong4>(num_items);
+        TestOp<int, TestFoo>(num_items);
+        TestOp<int, TestBar>(num_items);
+
+        TestOp<char, int>(num_items);
+        TestOp<long long, int>(num_items);
+        TestOp<TestFoo, int>(num_items);
+        TestOp<TestBar, int>(num_items);
+
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/test/test_device_run_length_encode.cu b/external/cub/test/test_device_run_length_encode.cu
new file mode 100644
index 00000000000..0be20ce2189
--- /dev/null
+++ b/external/cub/test/test_device_run_length_encode.cu
@@ -0,0 +1,890 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceRunLengthEncode utilities (Encode and NonTrivialRuns)
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/reduce.h>
+#include <thrust/iterator/constant_iterator.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/device/device_reduce.cuh>
+#include <cub/device/device_run_length_encode.cuh>
+#include <cub/thread/thread_operators.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false;    // Verbose-output flag, off by default
+int                     g_timing_iterations = 0;        // Number of timing iterations, zero by default
+int                     g_repeat            = 0;        // Repeat count, zero by default
+CachingDeviceAllocator  g_allocator(true);              // Shared device allocator; NOTE(review): `true` is presumably the skip-cleanup flag -- confirm in cub/util_allocator.cuh
+
+// Dispatch types (used as Int2Type<> tags to select a Dispatch() overload)
+enum Backend
+{
+    CUB,        // CUB method
+    THRUST,     // Thrust method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+// Operation types (used as Int2Type<> tags to select a Dispatch() overload)
+enum RleMethod
+{
+    RLE,                // Run length encode (DeviceRunLengthEncode::Encode)
+    NON_TRIVIAL,        // Non-trivial runs (DeviceRunLengthEncode::NonTrivialRuns)
+    CSR,                // NOTE(review): no use visible in this part of the file -- meaning to be confirmed
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * Dispatch to run-length encode entrypoint: calls DeviceRunLengthEncode::Encode timing_timing_iterations times and returns the status of the last call (cudaSuccess if the loop never runs)
+ */
+template <
+    typename                    InputIteratorT,
+    typename                    UniqueOutputIteratorT,
+    typename                    OffsetsOutputIteratorT,
+    typename                    LengthsOutputIteratorT,
+    typename                    NumRunsIterator,
+    typename                    OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<RLE>               method,
+    Int2Type<CUB>               dispatch_to,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,      // not used by this overload
+    cudaError_t                 *d_cdp_error,               // not used by this overload
+
+    void*               d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    UniqueOutputIteratorT       d_unique_out,
+    OffsetsOutputIteratorT      d_offsets_out,              // not used by this overload
+    LengthsOutputIteratorT      d_lengths_out,
+    NumRunsIterator             d_num_runs,
+    cub::Equality               equality_op,                // not used by this overload
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceRunLengthEncode::Encode(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_unique_out,
+            d_lengths_out,
+            d_num_runs,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+    return error;
+}
+
+
+/**
+ * Dispatch to non-trivial runs entrypoint: calls DeviceRunLengthEncode::NonTrivialRuns timing_timing_iterations times and returns the status of the last call (cudaSuccess if the loop never runs)
+ */
+template <
+    typename                    InputIteratorT,
+    typename                    UniqueOutputIteratorT,
+    typename                    OffsetsOutputIteratorT,
+    typename                    LengthsOutputIteratorT,
+    typename                    NumRunsIterator,
+    typename                    OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<NON_TRIVIAL>       method,
+    Int2Type<CUB>               dispatch_to,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,      // not used by this overload
+    cudaError_t                 *d_cdp_error,               // not used by this overload
+
+    void*               d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    UniqueOutputIteratorT       d_unique_out,               // not used by this overload
+    OffsetsOutputIteratorT      d_offsets_out,
+    LengthsOutputIteratorT      d_lengths_out,
+    NumRunsIterator             d_num_runs,
+    cub::Equality               equality_op,                // not used by this overload
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceRunLengthEncode::NonTrivialRuns(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_offsets_out,
+            d_lengths_out,
+            d_num_runs,
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+    return error;
+}
+
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to run-length encode entrypoint (Thrust): runs thrust::reduce_by_key over the input with a constant value per item, then copies the number of output runs to d_num_runs.  Always returns cudaSuccess
+ */
+template <
+    typename                    InputIteratorT,
+    typename                    UniqueOutputIteratorT,
+    typename                    OffsetsOutputIteratorT,
+    typename                    LengthsOutputIteratorT,
+    typename                    NumRunsIterator,
+    typename                    OffsetT>
+cudaError_t Dispatch(
+    Int2Type<RLE>               method,
+    Int2Type<THRUST>            dispatch_to,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,      // not used by this overload
+    cudaError_t                 *d_cdp_error,               // not used by this overload
+
+    void                        *d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    UniqueOutputIteratorT       d_unique_out,
+    OffsetsOutputIteratorT      d_offsets_out,              // not used by this overload
+    LengthsOutputIteratorT      d_lengths_out,
+    NumRunsIterator             d_num_runs,
+    cub::Equality               equality_op,                // not used by this overload
+    OffsetT                     num_items,
+    cudaStream_t                stream,                     // not used by this overload
+    bool                        debug_synchronous)          // not used by this overload
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                                // ... then the input iterator's value type,
+        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type UniqueT;                          // ... else the output iterator's value type
+
+    // The lengths output value type
+    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+        OffsetT,                                                                                                    // ... then the OffsetT type,
+        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)    // size-query call: report a dummy size of one byte and do no work
+    {
+        temp_storage_bytes = 1;
+    }
+    else
+    {
+        thrust::device_ptr<InputT>      d_in_wrapper(d_in);
+        thrust::device_ptr<UniqueT>     d_unique_out_wrapper(d_unique_out);
+        thrust::device_ptr<LengthT>     d_lengths_out_wrapper(d_lengths_out);
+
+        thrust::pair<thrust::device_ptr<UniqueT>, thrust::device_ptr<LengthT> > d_out_ends;
+
+        LengthT one_val;        // NOTE(review): presumably the value 1, so that each reduced value is a run length -- confirm InitValue semantics
+        InitValue(INTEGER_SEED, one_val, 1);
+        thrust::constant_iterator<LengthT> constant_one(one_val);
+
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            d_out_ends = thrust::reduce_by_key(
+                d_in_wrapper,
+                d_in_wrapper + num_items,
+                constant_one,
+                d_unique_out_wrapper,
+                d_lengths_out_wrapper);
+        }
+
+        OffsetT num_runs = OffsetT(d_out_ends.first - d_unique_out_wrapper);    // distance from the start of the unique output to its returned end
+        CubDebugExit(cudaMemcpy(d_num_runs, &num_runs, sizeof(OffsetT), cudaMemcpyHostToDevice));
+    }
+
+    return cudaSuccess;
+}
+
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**  Wrapper kernel (launched <<<1,1>>> by the CDP Dispatch overload) that calls the CUB-backend Dispatch from device
+ *  code. Writes the resulting status to *d_cdp_error and, when CUB_CDP is defined, the temp-storage size to
+ *  *d_temp_storage_bytes; without CUB_CDP it only stores cudaErrorNotSupported. The nested call always uses stream 0. */
+template <
+    int                         RLE_METHOD,
+    typename                    InputIteratorT,
+    typename                    UniqueOutputIteratorT,
+    typename                    OffsetsOutputIteratorT,
+    typename                    LengthsOutputIteratorT,
+    typename                    NumRunsIterator,
+    typename                    EqualityOp,
+    typename                    OffsetT>
+__global__ void CnpDispatchKernel(
+    Int2Type<RLE_METHOD>            method,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t                      temp_storage_bytes,
+    InputIteratorT              d_in,
+    UniqueOutputIteratorT       d_unique_out,
+    OffsetsOutputIteratorT      d_offsets_out,
+    LengthsOutputIteratorT      d_lengths_out,
+    NumRunsIterator             d_num_runs,
+    EqualityOp                  equality_op,    // typed by the template parameter so EqualityOp is deducible at the launch site
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+
+#ifndef CUB_CDP
+    *d_cdp_error = cudaErrorNotSupported;
+#else
+    *d_cdp_error = Dispatch(method, Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);
+
+    *d_temp_storage_bytes = temp_storage_bytes;
+#endif
+}
+
+
+/**  Host-side entry for the CDP backend: launches CnpDispatchKernel with one block of one thread and no explicit
+ *  stream, then copies the temp-storage size and the device-side status back to the host with cudaMemcpy.
+ *  The stream parameter is not forwarded (0 is passed to the kernel); the device-side status is the return value. */
+template <
+    int                         RLE_METHOD,
+    typename                    InputIteratorT,
+    typename                    UniqueOutputIteratorT,
+    typename                    OffsetsOutputIteratorT,
+    typename                    LengthsOutputIteratorT,
+    typename                    NumRunsIterator,
+    typename                    EqualityOp,
+    typename                    OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<RLE_METHOD>        method,
+    Int2Type<CDP>               dispatch_to,
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,
+    cudaError_t                 *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    UniqueOutputIteratorT       d_unique_out,
+    OffsetsOutputIteratorT      d_offsets_out,
+    LengthsOutputIteratorT      d_lengths_out,
+    NumRunsIterator             d_num_runs,
+    EqualityOp                  equality_op,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Launch the single-thread wrapper kernel that performs the dispatch on the device (launch status is not checked here)
+    CnpDispatchKernel<<<1,1>>>(method, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);
+
+    // Copy the temp-storage size written by the kernel back into the caller's reference
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy the status recorded by the kernel back to the host and return it
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Fill h_in with num_items values laid out as back-to-back runs keyed 0, 1, 2, ...
+ * Run lengths follow max_segment: negative -> one run covering everything, below 2 -> every run has length 1,
+ * otherwise a length derived from RandomBits() scaled by max_segment and clamped to at least 1.
+ */
+template <typename T>
+void Initialize(
+    int         entropy_reduction,
+    T           *h_in,
+    int         num_items,
+    int         max_segment)
+{
+    const unsigned int uint_max = (unsigned int) -1;
+
+    // Each pass of the outer loop emits one run; run_start only advances inside the inner loop
+    int run_key = 0;
+    for (int run_start = 0; run_start < num_items; ++run_key)
+    {
+        // Pick how many times the current key repeats
+        int run_length;
+        if (max_segment < 0)
+        {
+            run_length = num_items;
+        }
+        else if (max_segment < 2)
+        {
+            run_length = 1;
+        }
+        else
+        {
+            RandomBits(run_length, entropy_reduction);
+            run_length = (int) ((double(run_length) * double(max_segment)) / double(uint_max));
+            run_length = CUB_MAX(1, run_length);
+        }
+
+        // Write the run, clipped to the end of the buffer
+        const int run_end = CUB_MIN(run_start + run_length, num_items);
+        for (; run_start < run_end; ++run_start)
+        {
+            InitValue(INTEGER_SEED, h_in[run_start], run_key);
+        }
+    }
+
+    if (!g_verbose)
+        return;
+
+    // Echo the generated input
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n\n");
+}
+
+
+/**
+ * Host reference: records each kept run's value, start offset and length, and returns the number of runs recorded
+ */
+template <
+    RleMethod       RLE_METHOD,
+    typename        InputIteratorT,
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT,
+    typename        EqualityOp>
+int Solve(
+    InputIteratorT  h_in,
+    T               *h_unique_reference,
+    OffsetT         *h_offsets_reference,
+    LengthT         *h_lengths_reference,
+    EqualityOp      equality_op,
+    int             num_items)
+{
+    if (num_items == 0)
+        return 0;
+
+    const bool keep_trivial = (RLE_METHOD != NON_TRIVIAL);  // length-1 runs are dropped only in NON_TRIVIAL mode
+    // Run currently being extended, seeded from the first item
+    T       run_value   = h_in[0];
+    LengthT run_length  = 1;
+    int     run_start   = 0;
+    int     runs_kept   = 0;
+
+    for (int idx = 1; idx < num_items; ++idx)
+    {
+        if (equality_op(run_value, h_in[idx]))
+        {
+            run_length++;
+        }
+        else
+        {
+            if (keep_trivial || (run_length > 1))
+            {
+                h_unique_reference[runs_kept]   = run_value;
+                h_offsets_reference[runs_kept]  = run_start;
+                h_lengths_reference[runs_kept]  = run_length;
+                runs_kept++;
+            }
+            run_length = 1;
+            run_start = idx;
+        }
+        run_value = h_in[idx];
+    }
+
+    if (keep_trivial || (run_length > 1))
+    {
+        h_unique_reference[runs_kept]   = run_value;
+        h_offsets_reference[runs_kept]  = run_start;
+        h_lengths_reference[runs_kept]  = run_length;
+        runs_kept++;
+    }
+
+    return runs_kept;
+}
+
+
+
+/**  Runs one backend/method combination on d_in: sizes and allocates temp storage, performs one checked run against
+ *  the host references (keys for RLE, offsets for non-RLE, lengths for non-CSR, and the run count), then a timed
+ *  run of g_timing_iterations iterations. All device buffers are freed before the final AssertEquals. */
+template <
+    RleMethod           RLE_METHOD,
+    Backend             BACKEND,
+    typename            DeviceInputIteratorT,
+    typename            T,
+    typename            OffsetT,
+    typename            LengthT,
+    typename            EqualityOp>
+void Test(
+    DeviceInputIteratorT d_in,
+    T                   *h_unique_reference,
+    OffsetT             *h_offsets_reference,
+    LengthT             *h_lengths_reference,
+    EqualityOp          equality_op,
+    int                 num_runs,
+    int                 num_items)
+{
+    // Allocate device output arrays and number of segments (element types match the sizeof() used below)
+    T*          d_unique_out       = NULL;
+    OffsetT*    d_offsets_out      = NULL;
+    LengthT*    d_lengths_out      = NULL;
+    int*        d_num_runs         = NULL;
+
+    if (RLE_METHOD == RLE)
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_unique_out, sizeof(T) * num_items));
+    if (RLE_METHOD == NON_TRIVIAL)
+        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offsets_out, sizeof(OffsetT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(LengthT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));
+
+    // Allocate CDP device arrays
+    size_t*          d_temp_storage_bytes = NULL;
+    cudaError_t*     d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage (first call, with a NULL d_temp_storage, only fills in temp_storage_bytes)
+    void*           d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output arrays
+    if (RLE_METHOD == RLE)
+        CubDebugExit(cudaMemset(d_unique_out,   0, sizeof(T) * num_items));
+    if (RLE_METHOD == NON_TRIVIAL)
+        CubDebugExit(cudaMemset(d_offsets_out,  0, sizeof(OffsetT) * num_items));
+    CubDebugExit(cudaMemset(d_lengths_out,  0, sizeof(LengthT) * num_items));
+    CubDebugExit(cudaMemset(d_num_runs,     0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));
+
+    // Check for correctness (and display results, if specified)
+    int compare0 = 0;
+    int compare1 = 0;
+    int compare2 = 0;
+    int compare3 = 0;
+
+    if (RLE_METHOD == RLE)
+    {
+        compare0 = CompareDeviceResults(h_unique_reference, d_unique_out, num_runs, true, g_verbose);
+        printf("\t Keys %s\n", compare0 ? "FAIL" : "PASS");
+    }
+
+    if (RLE_METHOD != RLE)
+    {
+        compare1 = CompareDeviceResults(h_offsets_reference, d_offsets_out, num_runs, true, g_verbose);
+        printf("\t Offsets %s\n", compare1 ? "FAIL" : "PASS");
+    }
+
+    if (RLE_METHOD != CSR)
+    {
+        compare2 = CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose);
+        printf("\t Lengths %s\n", compare2 ? "FAIL" : "PASS");
+    }
+
+    compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
+    printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, false));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance (byte count kept in size_t so the sizeof() products are not truncated to int)
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        size_t bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
+        float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (d_unique_out) CubDebugExit(g_allocator.DeviceFree(d_unique_out));
+    if (d_offsets_out) CubDebugExit(g_allocator.DeviceFree(d_offsets_out));
+    if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
+    if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts
+    AssertEquals(0, compare0 | compare1 | compare2 | compare3);
+}
+
+
+/**
+ * Pointer-input test: generates host input with Initialize(), solves it on the host, copies it to the device and runs Test()
+ */
+template <
+    RleMethod       RLE_METHOD,
+    Backend         BACKEND,
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void TestPointer(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment)
+{
+    // Allocate host input and reference-output arrays (num_items elements each)
+    T*      h_in                    = new T[num_items];
+    T*      h_unique_reference      = new T[num_items];
+    OffsetT* h_offsets_reference     = new OffsetT[num_items];
+    LengthT* h_lengths_reference     = new LengthT[num_items];
+
+    for (int i = 0; i < num_items; ++i)     // NOTE(review): only the offsets reference is pre-filled (with 1); reason not evident here
+        InitValue(INTEGER_SEED, h_offsets_reference[i], 1);
+
+    // Generate the input, then compute the expected outputs on the host
+    Equality equality_op;
+    Initialize(entropy_reduction, h_in, num_items, max_segment);
+
+    int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);
+
+    printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}, max_segment %d, entropy_reduction %d\n",
+        (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        num_items, num_runs, float(num_items) / num_runs,
+        typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name(),
+        max_segment, entropy_reduction);
+    fflush(stdout);
+
+    // Allocate the device copy of the input
+    T* d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
+
+    // Copy the host input to the device
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+
+    // Run the device test against the host references
+    Test<RLE_METHOD, BACKEND>(d_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);
+
+    // Release host arrays and the device input
+    if (h_in) delete[] h_in;
+    if (h_unique_reference) delete[] h_unique_reference;
+    if (h_offsets_reference) delete[] h_offsets_reference;
+    if (h_lengths_reference) delete[] h_lengths_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/**
+ * Iterator-input test (Int2Type<true> overload): feeds a ConstantInputIterator, so the whole input is a single repeated value
+ */
+template <
+    RleMethod       RLE_METHOD,
+    Backend         BACKEND,
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void TestIterator(
+    int             num_items,
+    Int2Type<true>  is_primitive)
+{
+    // Allocate host reference-output arrays (num_items elements each)
+    T* h_unique_reference       = new T[num_items];
+    OffsetT* h_offsets_reference = new OffsetT[num_items];
+    LengthT* h_lengths_reference = new LengthT[num_items];
+
+    T one_val;
+    InitValue(INTEGER_SEED, one_val, 1);
+    ConstantInputIterator<T, int> h_in(one_val);    // the same iterator object is used for the host solve and the device test
+
+    // Compute the expected outputs on the host
+    Equality equality_op;
+    int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);
+
+    printf("\nIterator %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}\n",
+        (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        num_items, num_runs, float(num_items) / num_runs,
+        typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name());
+    fflush(stdout);
+
+    // Run the device test against the host references
+    Test<RLE_METHOD, BACKEND>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);
+
+    // Release host arrays
+    if (h_unique_reference) delete[] h_unique_reference;
+    if (h_offsets_reference) delete[] h_offsets_reference;
+    if (h_lengths_reference) delete[] h_lengths_reference;
+}
+
+
+template <
+    RleMethod       RLE_METHOD,
+    Backend         BACKEND,
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void TestIterator(
+    int             num_items,
+    Int2Type<false> is_primitive)
+{}  // Deliberate no-op overload: callers pass Int2Type<Traits<T>::PRIMITIVE>(), so this one is picked when that flag is false
+
+
+/**
+ * Run the iterator test, then pointer tests over a sweep of run-length settings
+ */
+template <
+    RleMethod       RLE_METHOD,
+    Backend         BACKEND,
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void Test(
+    int             num_items)
+{
+    // Constant-valued input iterator: the whole input is one run
+    TestIterator<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, Int2Type<Traits<T>::PRIMITIVE>());
+
+    // max_segment of 1: every item is its own run
+    TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, 1);
+
+    // Sweep max_segment = 3, 9, 27, ... while it stays below min(num_items, (unsigned short) -1)
+    const int segment_cap = CUB_MIN(num_items, (unsigned short) -1);
+    for (int segment = 3; segment < segment_cap; segment *= 3)
+    {
+        // No entropy reduction
+        TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, segment);
+        // Entropy reduction of 4
+        TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 4, segment);
+    }
+}
+
+
+/**
+ * Run the RLE and NON_TRIVIAL tests through the CUB backend, and through the CDP backend too when CUB_CDP is defined
+ */
+template <
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void TestDispatch(
+    int             num_items)
+{
+    Test<RLE,           CUB, T, OffsetT, LengthT>(num_items);
+    Test<NON_TRIVIAL,   CUB, T, OffsetT, LengthT>(num_items);
+
+#ifdef CUB_CDP
+    Test<RLE,           CDP, T, OffsetT, LengthT>(num_items);
+    Test<NON_TRIVIAL,   CDP, T, OffsetT, LengthT>(num_items);
+#endif
+}
+
+
+/**
+ * Run TestDispatch at the requested size, or over a built-in set of sizes when num_items is negative
+ */
+template <
+    typename        T,
+    typename        OffsetT,
+    typename        LengthT>
+void TestSize(
+    int             num_items)
+{
+    // A non-negative size is tested exactly once
+    if (num_items >= 0)
+    {
+        TestDispatch<T, OffsetT, LengthT>(num_items);
+        return;
+    }
+
+    // Fixed sweep of sizes
+    const int fixed_sizes[] = {0, 1, 100, 10000, 1000000};
+    for (int s = 0; s < int(sizeof(fixed_sizes) / sizeof(fixed_sizes[0])); ++s)
+    {
+        TestDispatch<T, OffsetT, LengthT>(fixed_sizes[s]);
+    }
+
+    // Ten random sizes scaled into the range 1:10,000,000
+    const unsigned int uint_max = (unsigned int) -1;
+    for (int trial = 0; trial < 10; ++trial)
+    {
+        unsigned int random_size;
+        RandomBits(random_size);
+        random_size = (unsigned int) ((double(random_size) * double(10000000)) / double(uint_max));
+        random_size = CUB_MAX(1, random_size);
+        TestDispatch<T, OffsetT, LengthT>(random_size);
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Entry point: parses command-line options, initializes the device, then runs the test set selected at compile time
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;
+    int entropy_reduction   = 0;
+    int max_segment              = 1000;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("maxseg", max_segment);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage (every option is bracketed and space-separated)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--entropy=<segment length bit entropy reduction rounds>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v] "
+            "[--cdp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    printf("\n");
+
+    // Get ptx version
+    int ptx_version;
+    CubDebugExit(PtxVersion(ptx_version));
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic CUB test
+    if (num_items < 0) num_items = 32000000;
+
+    TestPointer<RLE,            CUB, int, int, int>(    num_items, entropy_reduction, max_segment);
+    TestPointer<NON_TRIVIAL,    CUB, int, int, int>(    num_items, entropy_reduction, max_segment);
+    TestIterator<RLE,           CUB, float, int, int>(  num_items, Int2Type<Traits<float>::PRIMITIVE>());
+
+
+#elif defined(QUICK_TEST)
+
+    // Compile/run quick tests
+    if (num_items < 0) num_items = 32000000;
+
+    TestPointer<RLE,            CUB, int, int, int>(    num_items, entropy_reduction, max_segment);
+    TestPointer<RLE,            THRUST, int, int, int>(    num_items, entropy_reduction, max_segment);
+
+#else
+
+    // Compile/run thorough tests (g_repeat + 1 passes; max_segment and entropy_reduction are not used on this path)
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test different input types
+        TestSize<char,          int, int>(num_items);
+        TestSize<short,         int, int>(num_items);
+        TestSize<int,           int, int>(num_items);
+        TestSize<long,          int, int>(num_items);
+        TestSize<long long,     int, int>(num_items);
+        TestSize<float,         int, int>(num_items);
+        TestSize<double,        int, int>(num_items);
+
+        TestSize<uchar2,        int, int>(num_items);
+        TestSize<uint2,         int, int>(num_items);
+        TestSize<uint3,         int, int>(num_items);
+        TestSize<uint4,         int, int>(num_items);
+        TestSize<ulonglong4,    int, int>(num_items);
+        TestSize<TestFoo,       int, int>(num_items);
+        TestSize<TestBar,       int, int>(num_items);
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/test/test_device_scan.cu b/external/cub/test/test_device_scan.cu
new file mode 100644
index 00000000000..63c80682ced
--- /dev/null
+++ b/external/cub/test/test_device_scan.cu
@@ -0,0 +1,1015 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceScan utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/scan.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/iterator/discard_output_iterator.cuh>
+#include <cub/device/device_scan.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose           = false;    // NOTE(review): presumably enables verbose output; its users are outside this chunk
+int                     g_timing_iterations = 0;        // NOTE(review): presumably the timed-iteration count; confirm against main()
+int                     g_repeat            = 0;        // NOTE(review): presumably extra repetitions of the test suite; confirm against main()
+double                  g_device_giga_bandwidth;        // no explicit initializer (zero-initialized as a namespace-scope object)
+CachingDeviceAllocator  g_allocator(true);              // NOTE(review): meaning of the 'true' constructor argument is not visible here
+
+// Dispatch types: enumerators are wrapped in Int2Type<> to select a Dispatch() overload
+enum Backend
+{
+    CUB,        // CUB method
+    THRUST,     // Thrust method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+/**
+ * \brief WrapperFunctor: forwards operator() to a wrapped OpT (for precluding test-specialized dispatch to *Sum variants)
+ */
+template<typename OpT>
+struct WrapperFunctor
+{
+    OpT op;     // the wrapped binary operator, stored by value
+
+    WrapperFunctor(OpT op) : op(op) {}  // NOTE(review): no __host__ __device__ qualifiers here, unlike operator() below
+
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return op(a, b);    // pure pass-through; the wrapper's own type differs from OpT
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceScan entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Generic CUB path: forwards to DeviceScan::ExclusiveScan with the caller's scan_op and initial_value
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       dispatch_to,
+    IsPrimitiveT        is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    InitialValueT       initial_value,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Issue the identical call once per requested iteration; only the last status is returned
+    cudaError_t status = cudaSuccess;
+    int remaining = timing_timing_iterations;
+    for (; remaining > 0; --remaining)
+        status = DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items, stream, debug_synchronous);
+    return status;
+}
+
+
+/**
+ * CUB path for Int2Type<true> with cub::Sum: forwards to DeviceScan::ExclusiveSum (initial_value is not passed on)
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename InitialValueT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       dispatch_to,
+    Int2Type<true>      is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    Sum                 scan_op,
+    InitialValueT       initial_value,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Issue the identical call once per requested iteration; only the last status is returned
+    cudaError_t status = cudaSuccess;
+    int remaining = timing_timing_iterations;
+    for (; remaining > 0; --remaining)
+        status = DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+    return status;
+}
+
+
+/**
+ * Generic CUB path selected by a NullType initial_value: forwards to DeviceScan::InclusiveScan with the caller's scan_op
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       dispatch_to,
+    IsPrimitiveT        is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    NullType            initial_value,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Issue the identical call once per requested iteration; only the last status is returned
+    cudaError_t status = cudaSuccess;
+    int remaining = timing_timing_iterations;
+    for (; remaining > 0; --remaining)
+        status = DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream, debug_synchronous);
+    return status;
+}
+
+
+/**
+ * CUB path for Int2Type<true> with cub::Sum and a NullType initial_value: forwards to DeviceScan::InclusiveSum
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>       dispatch_to,
+    Int2Type<true>      is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    Sum                 scan_op,
+    NullType            initial_value,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Issue the identical call once per requested iteration; only the last status is returned
+    cudaError_t status = cudaSuccess;
+    int remaining = timing_timing_iterations;
+    for (; remaining > 0; --remaining)
+        status = DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
+    return status;
+}
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Thrust path: a NULL d_temp_storage only reports a 1-byte requirement; otherwise runs thrust::exclusive_scan per iteration
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    dispatch_to,
+    IsPrimitiveT        is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    InitialValueT       initial_value,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;     // size query only: nothing is scanned on this call
+    }
+    else
+    {
+        thrust::device_ptr<InputT> d_in_wrapper(d_in);
+        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            thrust::exclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, initial_value, scan_op);
+        }
+    }
+
+    return cudaSuccess;     // always success: this overload produces no error status of its own
+}
+
+
+/**
+ * Dispatch to Thrust's exclusive sum (thrust::exclusive_scan with its default operator and initial value)
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename InitialValueT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    dispatch_to,
+    Int2Type<true>      is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    Sum                 scan_op,
+    InitialValueT       initial_value,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Value type read through the input iterator
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // Value type written through the output iterator (falls back to the input's when the iterator reports void)
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
+        typename std::iterator_traits<InputIteratorT>::value_type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
+
+    // Size query: report a nominal one-byte requirement and do no scanning
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+        return cudaSuccess;
+    }
+
+    thrust::device_ptr<InputT> in_begin(d_in);
+    thrust::device_ptr<OutputT> out_begin(d_out);
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        thrust::exclusive_scan(in_begin, in_begin + num_items, out_begin);
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch to Thrust's inclusive scan (thrust::inclusive_scan with a caller-supplied operator)
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    dispatch_to,
+    IsPrimitiveT        is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    NullType            initial_value,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Value type read through the input iterator
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // Value type written through the output iterator (falls back to the input's when the iterator reports void)
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
+        typename std::iterator_traits<InputIteratorT>::value_type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
+
+    // Size query: report a nominal one-byte requirement and do no scanning
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+        return cudaSuccess;
+    }
+
+    thrust::device_ptr<InputT> in_begin(d_in);
+    thrust::device_ptr<OutputT> out_begin(d_out);
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        thrust::inclusive_scan(in_begin, in_begin + num_items, out_begin, scan_op);
+    }
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch to Thrust's inclusive sum (thrust::inclusive_scan with its default operator)
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<THRUST>    dispatch_to,
+    Int2Type<true>      is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    Sum                 scan_op,
+    NullType            initial_value,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    // Value type read through the input iterator
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // Value type written through the output iterator (falls back to the input's when the iterator reports void)
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
+        typename std::iterator_traits<InputIteratorT>::value_type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;
+
+    // Size query: report a nominal one-byte requirement and do no scanning
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+        return cudaSuccess;
+    }
+
+    thrust::device_ptr<InputT> in_begin(d_in);
+    thrust::device_ptr<OutputT> out_begin(d_out);
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        thrust::inclusive_scan(in_begin, in_begin + num_items, out_begin);
+    }
+
+    return cudaSuccess;
+}
+
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Wrapper kernel: runs the CUB-backend Dispatch from device code and stores its status in d_cdp_error
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
+__global__ void CnpDispatchKernel(
+    IsPrimitiveT        is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t              temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    InitialValueT       initial_value,
+    OffsetT             num_items,
+    bool                debug_synchronous)
+{
+#ifdef CUB_CDP
+    *d_cdp_error = Dispatch(
+        Int2Type<CUB>(),
+        is_primitive,
+        timing_timing_iterations,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items,
+        0,
+        debug_synchronous);
+
+    *d_temp_storage_bytes = temp_storage_bytes;
+#else
+    *d_cdp_error = cudaErrorNotSupported;
+#endif
+}
+
+
+/**
+ * Dispatch to CDP kernel (exits if the launch fails), then copy back the storage size and status
+ */
+template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<CDP>       dispatch_to,
+    IsPrimitiveT        is_primitive,
+    int                 timing_timing_iterations,
+    size_t              *d_temp_storage_bytes,
+    cudaError_t         *d_cdp_error,
+
+    void*               d_temp_storage,
+    size_t&             temp_storage_bytes,
+    InputIteratorT      d_in,
+    OutputIteratorT     d_out,
+    ScanOpT             scan_op,
+    InitialValueT       initial_value,
+    OffsetT             num_items,
+    cudaStream_t        stream,
+    bool                debug_synchronous)
+{
+    CnpDispatchKernel<<<1,1>>>(
+        is_primitive,
+        timing_timing_iterations,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items,
+        debug_synchronous);
+
+    // A rejected launch would leave the device-side results read below unwritten
+    CubDebugExit(cudaGetLastError());
+
+    // Copy out temp_storage_bytes, then the status recorded by the kernel
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Fill h_in with num_items generated values and, in verbose mode, print them
+ */
+template <typename T>
+void Initialize(
+    GenMode      gen_mode,
+    T            *h_in,
+    int          num_items)
+{
+    // Generate each element from its own index
+    for (int item = 0; item < num_items; ++item)
+        InitValue(gen_mode, h_in[item], item);
+
+    // Nothing more to do unless verbose output was requested
+    if (!g_verbose)
+        return;
+
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n\n");
+}
+
+/**
+ * Compute the host reference for an exclusive scan seeded with initial_value
+ */
+template <
+    typename        InputIteratorT,
+    typename        OutputT,
+    typename        ScanOpT>
+void Solve(
+    InputIteratorT  h_in,
+    OutputT         *h_reference,
+    int             num_items,
+    ScanOpT         scan_op,
+    OutputT         initial_value)
+{
+    // Nothing to produce for an empty problem
+    if (num_items <= 0)
+        return;
+
+    // Running aggregate of the initial value and every input preceding slot i
+    OutputT running = initial_value;
+
+    for (int i = 0; i < num_items; ++i)
+    {
+        OutputT val     = h_in[i];
+        h_reference[i]  = running;                  // exclusive: stored before val is folded in
+        running         = scan_op(running, val);
+    }
+}
+
+
+/**
+ * Compute the host reference for an inclusive scan (no initial value)
+ */
+template <
+    typename        InputIteratorT,
+    typename        OutputT,
+    typename        ScanOpT>
+void Solve(
+    InputIteratorT  h_in,
+    OutputT         *h_reference,
+    int             num_items,
+    ScanOpT         scan_op,
+    NullType)
+{
+    // Nothing to produce for an empty problem
+    if (num_items <= 0)
+        return;
+
+    // Seed the running aggregate with the first input
+    OutputT running = h_in[0];
+    h_reference[0]  = running;
+    for (int i = 1; i < num_items; ++i)
+    {
+        OutputT val     = h_in[i];
+        running         = scan_op(running, val);
+        h_reference[i]  = running;
+    }
+}
+
+
+/**
+ * Run one scan problem on BACKEND: check d_out against h_reference, then time repeated runs
+ */
+template <
+    Backend             BACKEND,
+    typename            DeviceInputIteratorT,
+    typename            OutputT,
+    typename            ScanOpT,
+    typename            InitialValueT>
+void Test(
+    DeviceInputIteratorT    d_in,
+    OutputT                 *h_reference,
+    int                     num_items,
+    ScanOpT                 scan_op,
+    InitialValueT           initial_value)
+{
+    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type InputT;
+
+    // Allocate device output array
+    OutputT *d_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_items));
+
+    // Allocate the single-element device words passed to Dispatch as d_temp_storage_bytes / d_cdp_error
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,   sizeof(cudaError_t) * 1));
+
+    // Size query (d_temp_storage is still NULL), then allocate temp_storage_bytes of temporary storage
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(
+        Int2Type<BACKEND>(),
+        Int2Type<Traits<OutputT>::PRIMITIVE>(),
+        1,                                          // timing iterations
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items,
+        0,                                          // stream
+        true));                                     // debug_synchronous
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output array
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_items));
+
+    // Run a single warmup/correctness iteration with the allocated temporary storage
+    CubDebugExit(Dispatch(
+        Int2Type<BACKEND>(),
+        Int2Type<Traits<OutputT>::PRIMITIVE>(),
+        1,                                          // timing iterations
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items,
+        0,                                          // stream
+        true));                                     // debug_synchronous
+
+    // Check for correctness (and display results, if specified); a nonzero result is reported as FAIL
+    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
+    printf("\t%s", compare ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Timed run: g_timing_iterations invocations inside one Dispatch call, debug_synchronous off
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(),
+        Int2Type<Traits<OutputT>::PRIMITIVE>(),
+        g_timing_iterations,
+        d_temp_storage_bytes,
+        d_cdp_error,
+        d_temp_storage,
+        temp_storage_bytes,
+        d_in,
+        d_out,
+        scan_op,
+        initial_value,
+        num_items,
+        0,                                          // stream
+        false));                                    // debug_synchronous
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance (skipped when no timing iterations were requested, avoiding a divide by zero)
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis = elapsed_millis / g_timing_iterations;
+        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;        // items per ms -> billions per second
+        float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT));      // one input read plus one output write per item
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+    }
+
+    printf("\n\n");
+
+    // Cleanup
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness assert, evaluated only after the device buffers have been released
+    AssertEquals(0, compare);
+}
+
+
+/**
+ * Test DeviceScan on generated data that is copied to a device array and passed as a raw pointer
+ */
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void TestPointer(
+    int             num_items,
+    GenMode         gen_mode,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    const char *backend_name = (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB";
+    const char *scan_kind    = (Equals<InitialValueT, NullType>::VALUE) ? "Inclusive" : "Exclusive";
+    const char *op_kind      = (Equals<ScanOpT, Sum>::VALUE) ? "Sum" : "Scan";
+    const char *gen_name     = (gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS";
+    printf("\nPointer %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes) , gen-mode %s\n",
+        backend_name, scan_kind, op_kind, num_items,
+        typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT), gen_name);
+    fflush(stdout);
+
+    // Host-side input and expected-output buffers
+    InputT*     host_input    = new InputT[num_items];
+    OutputT*    host_expected = new OutputT[num_items];
+
+    // Generate the input, then compute the reference solution on the host
+    Initialize(gen_mode, host_input, num_items);
+    Solve(host_input, host_expected, num_items, scan_op, initial_value);
+
+    // Device-side copy of the input
+    InputT *device_input = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&device_input, sizeof(InputT) * num_items));
+
+    // Upload the generated input
+    CubDebugExit(cudaMemcpy(device_input, host_input, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
+
+    // Run the device test against the host reference
+    Test<BACKEND>(device_input, host_expected, num_items, scan_op, initial_value);
+
+    // Cleanup (delete[] accepts null pointers, so no guards are needed for the host arrays)
+    delete[] host_input;
+    delete[] host_expected;
+    if (device_input) CubDebugExit(g_allocator.DeviceFree(device_input));
+}
+
+
+/**
+ * Test DeviceScan with a ConstantInputIterator (yielding a default-constructed InputT) as the input
+ */
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void TestIterator(
+    int             num_items,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    const char *backend_name = (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB";
+    const char *scan_kind    = (Equals<InitialValueT, NullType>::VALUE) ? "Inclusive" : "Exclusive";
+    const char *op_kind      = (Equals<ScanOpT, Sum>::VALUE) ? "Sum" : "Scan";
+    printf("\nIterator %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes)\n",
+        backend_name, scan_kind, op_kind, num_items,
+        typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT));
+    fflush(stdout);
+
+    // The same iterator serves as the input for both the host reference and the device run
+    InputT constant_value = InputT();
+    ConstantInputIterator<InputT, int> constant_in(constant_value);
+
+    // Host-side expected-output buffer
+    OutputT*  expected = new OutputT[num_items];
+
+    // Compute the reference solution on the host
+    Solve(constant_in, expected, num_items, scan_op, initial_value);
+
+    // Run the device test against the host reference
+    Test<BACKEND>(constant_in, expected, num_items, scan_op, initial_value);
+
+    // Cleanup (delete[] accepts null pointers, so no guard is needed)
+    delete[] expected;
+}
+
+
+/**
+ * Run the pointer tests with UNIFORM and RANDOM inputs, then the iterator test, on one backend
+ */
+template <
+    Backend         BACKEND,
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void Test(
+    int             num_items,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    TestPointer<BACKEND, InputT, OutputT>(  num_items, UNIFORM, scan_op, initial_value);
+    TestPointer<BACKEND, InputT, OutputT>(  num_items, RANDOM,  scan_op, initial_value);
+    TestIterator<BACKEND, InputT, OutputT>( num_items, scan_op, initial_value);     // takes no gen mode
+}
+
+
+/**
+ * Run the tests through the CUB backend, and also through the CDP backend when CUB_CDP is defined
+ */
+template <
+    typename        InputT,
+    typename        OutputT,
+    typename        ScanOpT,
+    typename        InitialValueT>
+void Test(
+    int             num_items,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    Test<CUB, InputT, OutputT>(num_items, scan_op, initial_value);
+#ifdef CUB_CDP
+    Test<CDP, InputT, OutputT>(num_items, scan_op, initial_value);      // compiled in only for CUB_CDP builds
+#endif
+}
+
+
+/**
+ * Run exclusive and inclusive scans of num_items with both the Sum and Max operators
+ */
+template <typename InputT, typename OutputT>
+void TestOp(
+    int             num_items,
+    OutputT         identity,
+    OutputT         initial_value)
+{
+    // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values)
+    Test<InputT, OutputT>(num_items, cub::Sum(), identity);
+    Test<InputT, OutputT>(num_items, cub::Max(), identity);
+
+    // Exclusive (operators wrapped in WrapperFunctor so they are non-specialized and initial_value is exercised)
+    Test<InputT, OutputT>(num_items, WrapperFunctor<cub::Sum>(cub::Sum()), initial_value);
+    Test<InputT, OutputT>(num_items, WrapperFunctor<cub::Max>(cub::Max()), initial_value);
+
+    // Inclusive (NullType in place of an initial value)
+    Test<InputT, OutputT>(num_items, cub::Sum(), NullType());
+    Test<InputT, OutputT>(num_items, cub::Max(), NullType());
+}
+
+
+/**
+ * Test different input sizes (a negative num_items selects the built-in size sweep)
+ */
+template <
+    typename InputT,
+    typename OutputT>
+void TestSize(
+    int     num_items,
+    OutputT identity,
+    OutputT initial_value)
+{
+    if (num_items >= 0)
+    {
+        TestOp<InputT>(num_items, identity, initial_value);
+        return;
+    }
+
+    // Fixed ladder of problem sizes
+    TestOp<InputT>(0,        identity, initial_value);
+    TestOp<InputT>(1,        identity, initial_value);
+    TestOp<InputT>(100,      identity, initial_value);
+    TestOp<InputT>(10000,    identity, initial_value);
+    TestOp<InputT>(1000000,  identity, initial_value);
+
+    // Ten further sizes scaled from RandomBits output into 1:10,000,000
+    const unsigned int max_int = (unsigned int) -1;
+    for (int trial = 0; trial < 10; ++trial)
+    {
+        unsigned int random_items;
+        RandomBits(random_items);
+        random_items = (unsigned int) ((double(random_items) * double(10000000)) / double(max_int));
+        random_items = CUB_MAX(1, random_items);
+        TestOp<InputT>(random_items,  identity, initial_value);
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Test driver: parse command-line options, initialize the device, and run the selected scan tests
+ */
+int main(int argc, char** argv)
+{
+    int num_items = -1;     // negative: no explicit size given on the command line
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v] "
+            "[--cdp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic CUB test
+    if (num_items < 0) num_items = 32000000;
+
+    TestPointer<CUB, char, int>(         num_items    , UNIFORM, Sum(), (int) (0));
+    TestPointer<CUB, int, int>(         num_items    , UNIFORM, Sum(), (int) (0));
+
+#elif defined(QUICK_TEST)
+
+    // Get device ordinal
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get device SM version
+    int sm_version;
+    CubDebugExit(SmVersion(sm_version, device_ordinal));
+
+    // Compile/run quick tests (CUB vs. Thrust; item counts are scaled per type and SM version)
+    if (num_items < 0) num_items = 32000000;
+
+    TestPointer<CUB, char, char>(        num_items * ((sm_version <= 130) ? 1 : 4), UNIFORM, Sum(), char(0));
+    TestPointer<THRUST, char, char>(     num_items * ((sm_version <= 130) ? 1 : 4), UNIFORM, Sum(), char(0));
+
+    printf("----------------------------\n");
+    TestPointer<CUB, short, short>(       num_items * ((sm_version <= 130) ? 1 : 2), UNIFORM, Sum(), short(0));
+    TestPointer<THRUST, short, short>(    num_items * ((sm_version <= 130) ? 1 : 2), UNIFORM, Sum(), short(0));
+
+    printf("----------------------------\n");
+    TestPointer<CUB, int, int>(         num_items    , UNIFORM, Sum(), (int) (0));
+    TestPointer<THRUST, int, int>(      num_items    , UNIFORM, Sum(), (int) (0));
+
+    printf("----------------------------\n");
+    TestPointer<CUB, long long, long long>(   num_items / 2, UNIFORM, Sum(), (long long) (0));
+    TestPointer<THRUST, long long, long long>(num_items / 2, UNIFORM, Sum(), (long long) (0));
+
+    printf("----------------------------\n");
+    TestPointer<CUB, TestBar, TestBar>(     num_items / 4, UNIFORM, Sum(), TestBar());
+    TestPointer<THRUST, TestBar, TestBar>(  num_items / 4, UNIFORM, Sum(), TestBar());
+
+#else
+
+    // Compile/run thorough tests (the suite runs g_repeat + 1 times)
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test different input+output data types
+        TestSize<unsigned char>(num_items,      (int) 0, (int) 99);
+
+        // Test same input+output data types
+        TestSize<unsigned char>(num_items,      (unsigned char) 0,      (unsigned char) 99);
+        TestSize<char>(num_items,               (char) 0,               (char) 99);
+        TestSize<unsigned short>(num_items,     (unsigned short) 0,     (unsigned short)99);
+        TestSize<unsigned int>(num_items,       (unsigned int) 0,       (unsigned int) 99);
+        TestSize<unsigned long long>(num_items, (unsigned long long) 0, (unsigned long long) 99);
+
+        TestSize<uchar2>(num_items,     make_uchar2(0, 0),              make_uchar2(17, 21));
+        TestSize<char2>(num_items,      make_char2(0, 0),               make_char2(17, 21));
+        TestSize<ushort2>(num_items,    make_ushort2(0, 0),             make_ushort2(17, 21));
+        TestSize<uint2>(num_items,      make_uint2(0, 0),               make_uint2(17, 21));
+        TestSize<ulonglong2>(num_items, make_ulonglong2(0, 0),          make_ulonglong2(17, 21));
+        TestSize<uchar4>(num_items,     make_uchar4(0, 0, 0, 0),        make_uchar4(17, 21, 32, 85));
+        TestSize<char4>(num_items,      make_char4(0, 0, 0, 0),         make_char4(17, 21, 32, 85));
+
+        TestSize<ushort4>(num_items,    make_ushort4(0, 0, 0, 0),       make_ushort4(17, 21, 32, 85));
+        TestSize<uint4>(num_items,      make_uint4(0, 0, 0, 0),         make_uint4(17, 21, 32, 85));
+        TestSize<ulonglong4>(num_items, make_ulonglong4(0, 0, 0, 0),    make_ulonglong4(17, 21, 32, 85));
+
+        TestSize<TestFoo>(num_items,
+            TestFoo::MakeTestFoo(0, 0, 0, 0),
+            TestFoo::MakeTestFoo(1ll << 63, 1 << 31, short(1 << 15), char(1 << 7)));
+
+        TestSize<TestBar>(num_items,
+            TestBar(0, 0),
+            TestBar(1ll << 63, 1 << 31));
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/test/test_device_select_if.cu b/external/cub/test/test_device_select_if.cu
new file mode 100644
index 00000000000..a02b020f336
--- /dev/null
+++ b/external/cub/test/test_device_select_if.cu
@@ -0,0 +1,1039 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceSelect::If and DevicePartition::If utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/copy.h>
+#include <thrust/partition.h>
+#include <thrust/iterator/reverse_iterator.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/device/device_select.cuh>
+#include <cub/device/device_partition.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose               = false;    // Enables printing of the generated input in Initialize(); also forwarded to CompareDeviceResults()
+int                     g_timing_iterations     = 0;        // Iteration count of the timed Dispatch() pass; the performance report is skipped unless it is > 0
+int                     g_repeat                = 0;        // NOTE(review): presumably a repeat count for the whole test suite -- confirm against main()
+float                   g_device_giga_bandwidth;            // Denominator of the "% peak" figure printed by Test(); no initializer here (presumably peak GB/s set at startup -- confirm)
+CachingDeviceAllocator  g_allocator(true);                  // Allocator through which the test helpers allocate and free their device buffers
+
+// Dispatch types: each enumerator selects one family of Dispatch() overloads through Int2Type<>
+enum Backend
+{
+    CUB,        // CUB method: call the DeviceSelect / DevicePartition entrypoints directly
+    THRUST,     // Thrust method: call thrust::copy_if / thrust::partition_copy instead
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method, via a <<<1,1>>> wrapper kernel
+};
+
+
+// Selection functor type: accepts the items that compare strictly less than a fixed threshold
+template <typename T>
+struct LessThan
+{
+    T compare;      // threshold every candidate item is tested against
+
+    __host__ __device__ __forceinline__
+    LessThan(T threshold) : compare(threshold) {}
+
+    __host__ __device__ __forceinline__ bool operator()(const T &item) const
+    {
+        return item < compare;
+    }
+};
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceSelect entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * Dispatch to select if entrypoint (CUB back-end): invokes DeviceSelect::If once per requested iteration
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               dispatch_to,
+    Int2Type<false>             is_flagged,
+    Int2Type<false>             is_partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Status of the most recent call; remains cudaSuccess when no iterations are requested
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        status = DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
+
+    return status;
+}
+
+
+/**
+ * Dispatch to partition if entrypoint (CUB back-end): invokes DevicePartition::If once per requested iteration
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               dispatch_to,
+    Int2Type<false>             is_flagged,
+    Int2Type<true>              is_partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Status of the most recent call; remains cudaSuccess when no iterations are requested
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        status = DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
+
+    return status;
+}
+
+
+/**
+ * Dispatch to select flagged entrypoint (CUB back-end): invokes DeviceSelect::Flagged once per requested iteration
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               dispatch_to,
+    Int2Type<true>              is_flagged,
+    Int2Type<false>             partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Status of the most recent call; remains cudaSuccess when no iterations are requested
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        status = DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
+
+    return status;
+}
+
+
+/**
+ * Dispatch to partition flagged entrypoint (CUB back-end): invokes DevicePartition::Flagged once per requested iteration
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               dispatch_to,
+    Int2Type<true>              is_flagged,
+    Int2Type<true>              partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Status of the most recent call; remains cudaSuccess when no iterations are requested
+    cudaError_t status = cudaSuccess;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+        status = DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
+
+    return status;
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+/**
+ * Dispatch to select if entrypoint (Thrust back-end): thrust::copy_if driven by the selection functor
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__host__ __forceinline__
+cudaError_t Dispatch(
+    Int2Type<THRUST>            dispatch_to,
+    Int2Type<false>             is_flagged,
+    Int2Type<false>             is_partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type: the output iterator's value type, unless that is void,
+    // in which case the input iterator's value type is used instead
+    typedef typename std::iterator_traits<OutputIteratorT>::value_type DeclaredOutputT;
+    typedef typename If<(Equals<DeclaredOutputT, void>::VALUE), InputT, DeclaredOutputT>::Type OutputT;
+
+    // Size query: the Thrust call below takes no temp storage, so report a placeholder of one byte
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+        return cudaSuccess;
+    }
+
+    thrust::device_ptr<InputT>  in_first(d_in);
+    thrust::device_ptr<OutputT> out_first(d_out);
+    thrust::device_ptr<OutputT> out_last;
+
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        out_last = thrust::copy_if(in_first, in_first + num_items, out_first, select_op);
+    }
+
+    OffsetT num_selected = OffsetT(out_last - out_first);
+    CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch to partition if entrypoint (Thrust back-end): thrust::partition_copy driven by the selection functor
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__host__ __forceinline__
+cudaError_t Dispatch(
+    Int2Type<THRUST>            dispatch_to,
+    Int2Type<false>             is_flagged,
+    Int2Type<true>              is_partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type: the output iterator's value type, unless that is void,
+    // in which case the input iterator's value type is used instead
+    typedef typename std::iterator_traits<OutputIteratorT>::value_type DeclaredOutputT;
+    typedef typename If<(Equals<DeclaredOutputT, void>::VALUE), InputT, DeclaredOutputT>::Type OutputT;
+
+    typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;
+
+    // Size query: the Thrust call below takes no temp storage, so report a placeholder of one byte
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+        return cudaSuccess;
+    }
+
+    thrust::device_ptr<InputT>  in_first(d_in);
+    thrust::device_ptr<OutputT> out_first(d_out);
+
+    // Rejected items are written through a reverse iterator anchored at the end of the output range
+    ReverseOutputIteratorT rejected_first(out_first + num_items);
+
+    thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> out_ends;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        out_ends = thrust::partition_copy(
+            in_first,
+            in_first + num_items,
+            out_first,
+            rejected_first,
+            select_op);
+    }
+
+    OffsetT num_selected = OffsetT(out_ends.first - out_first);
+    CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch to select flagged entrypoint (Thrust back-end): stencil form of thrust::copy_if, flags cast to bool
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__host__ __forceinline__
+cudaError_t Dispatch(
+    Int2Type<THRUST>            dispatch_to,
+    Int2Type<true>              is_flagged,
+    Int2Type<false>             is_partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // The flag type
+    typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type: the output iterator's value type, unless that is void,
+    // in which case the input iterator's value type is used instead
+    typedef typename std::iterator_traits<OutputIteratorT>::value_type DeclaredOutputT;
+    typedef typename If<(Equals<DeclaredOutputT, void>::VALUE), InputT, DeclaredOutputT>::Type OutputT;
+
+    // Size query: the Thrust call below takes no temp storage, so report a placeholder of one byte
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+        return cudaSuccess;
+    }
+
+    thrust::device_ptr<InputT>  in_first(d_in);
+    thrust::device_ptr<FlagT>   flags_first(d_flags);
+    thrust::device_ptr<OutputT> out_first(d_out);
+    thrust::device_ptr<OutputT> out_last;
+
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        out_last = thrust::copy_if(in_first, in_first + num_items, flags_first, out_first, CastOp<bool>());
+    }
+
+    OffsetT num_selected = OffsetT(out_last - out_first);
+    CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
+
+    return cudaSuccess;
+}
+
+
+/**
+ * Dispatch to partition flagged entrypoint (Thrust back-end): stencil form of thrust::partition_copy, flags cast to bool
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__host__ __forceinline__
+cudaError_t Dispatch(
+    Int2Type<THRUST>            dispatch_to,
+    Int2Type<true>              is_flagged,
+    Int2Type<true>              is_partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // The flag type
+    typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type: the output iterator's value type, unless that is void,
+    // in which case the input iterator's value type is used instead
+    typedef typename std::iterator_traits<OutputIteratorT>::value_type DeclaredOutputT;
+    typedef typename If<(Equals<DeclaredOutputT, void>::VALUE), InputT, DeclaredOutputT>::Type OutputT;
+
+    typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;
+
+    // Size query: the Thrust call below takes no temp storage, so report a placeholder of one byte
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;
+        return cudaSuccess;
+    }
+
+    thrust::device_ptr<InputT>  in_first(d_in);
+    thrust::device_ptr<FlagT>   flags_first(d_flags);
+    thrust::device_ptr<OutputT> out_first(d_out);
+
+    // Rejected items are written through a reverse iterator anchored at the end of the output range
+    ReverseOutputIteratorT rejected_first(out_first + num_items);
+    thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> out_ends;
+    for (int remaining = timing_timing_iterations; remaining > 0; --remaining)
+    {
+        out_ends = thrust::partition_copy(
+            in_first,
+            in_first + num_items,
+            flags_first,
+            out_first,
+            rejected_first,
+            CastOp<bool>());
+    }
+
+    OffsetT num_selected = OffsetT(out_ends.first - out_first);
+    CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
+
+    return cudaSuccess;
+}
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceSelect / DevicePartition from device code; the outcome is reported through d_cdp_error and d_temp_storage_bytes
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
+__global__ void CnpDispatchKernel(
+    IsFlaggedTag                is_flagged,
+    IsPartitionTag              is_partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t                      temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    bool                        debug_synchronous)
+{
+    // No thread-index guard: every launched thread performs the dispatch (the host-side Dispatch launches <<<1,1>>>)
+#ifndef CUB_CDP
+    *d_cdp_error = cudaErrorNotSupported;           // built without CUB_CDP: only report that device-side dispatch is unavailable
+#else
+    *d_cdp_error = Dispatch(Int2Type<CUB>(), is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, debug_synchronous);
+    *d_temp_storage_bytes = temp_storage_bytes;     // temp_storage_bytes is a by-value kernel argument that Dispatch takes by reference, so publish it through device memory
+#endif
+}
+
+
+/**
+ * Dispatch to CDP kernel: launches CnpDispatchKernel with a single thread, verifies the launch, then copies the temp-storage size and the device-side status back to the host (dispatch_to and stream are not used)
+ */
+template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
+cudaError_t Dispatch(
+    Int2Type<CDP>               dispatch_to,
+    IsFlaggedTag                is_flagged,
+    IsPartitionTag              is_partition,
+    int                         timing_timing_iterations,
+    size_t*                     d_temp_storage_bytes,
+    cudaError_t*                d_cdp_error,
+
+    void*                       d_temp_storage,
+    size_t&                     temp_storage_bytes,
+    InputIteratorT              d_in,
+    FlagIteratorT               d_flags,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    SelectOpT                   select_op,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<<<1,1>>>(is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, debug_synchronous);
+    CubDebugExit(cudaPeekAtLastError());    // a failed launch surfaces only via the last-error state; without this the copies below would read values the kernel never wrote
+
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem: fills h_in[0..num_items) with randomly seeded values and optionally prints them
+ */
+template <typename T>
+void Initialize(
+    T*  h_in,
+    int num_items)
+{
+    for (int idx = 0; idx < num_items; ++idx)
+    {
+        // Seed each item with a randomly selected value from [0..126] (a draw of 127 is folded onto 126)
+        unsigned int bits;
+        RandomBits(bits, 0, 0, 7);
+        const unsigned int seed_index = (bits == 127) ? 126u : bits;
+
+        InitValue(INTEGER_SEED, h_in[idx], seed_index);
+    }
+
+    if (!g_verbose)
+        return;
+
+    printf("Input:\n");
+    DisplayResults(h_in, num_items);
+    printf("\n\n");
+}
+
+
+/**
+ * Solve selection problem (and set corresponding flags): selected items are packed at the front of h_reference in input order, rejected items at the back in reverse input order; returns the number selected
+ */
+template <
+    typename        InputIteratorT,
+    typename        FlagIteratorT,
+    typename        SelectOpT,
+    typename        T>
+int Solve(
+    InputIteratorT  h_in,
+    SelectOpT       select_op,
+    T*              h_reference,
+    FlagIteratorT   h_flags,
+    int             num_items)
+{
+    int num_selected = 0;       // accepted so far: index of the next free slot at the front
+    int num_rejected = 0;       // rejected so far: distance of the next free slot from the back
+    for (int idx = 0; idx < num_items; ++idx)
+    {
+        // Record the predicate result as this item's flag and branch on the stored value
+        if ((h_flags[idx] = select_op(h_in[idx])))
+        {
+            h_reference[num_selected++] = h_in[idx];
+        }
+        else
+        {
+            h_reference[num_items - 1 - num_rejected++] = h_in[idx];
+        }
+    }
+    return num_selected;
+}
+
+
+
+/**
+ * Test DeviceSelect / DevicePartition for a given problem input: sizes and allocates temp storage, runs one checked iteration, runs the timed pass, then asserts that both data and count matched the reference
+ */
+template <
+    Backend             BACKEND,
+    bool                IS_FLAGGED,
+    bool                IS_PARTITION,
+    typename            DeviceInputIteratorT,
+    typename            FlagT,
+    typename            SelectOpT,
+    typename            T>
+void Test(
+    DeviceInputIteratorT    d_in,                   // input sequence handed to the dispatch as-is
+    FlagT*                  h_flags,                // host flags; copied to the device before the run
+    SelectOpT               select_op,              // selection functor forwarded to the dispatch
+    T*                      h_reference,            // expected output on the host
+    int                     num_selected,           // expected selection count
+    int                     num_items)              // number of input items
+{
+    // Allocate device flags, output, and num-selected
+    FlagT*      d_flags = NULL;
+    T*          d_out = NULL;
+    int*        d_num_selected_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(FlagT) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate CDP device arrays
+    size_t*         d_temp_storage_bytes = NULL;
+    cudaError_t*    d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage: the first dispatch runs with d_temp_storage == NULL, then the size left in temp_storage_bytes is allocated
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
+    d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Copy flags and clear device output array
+    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(FlagT) * num_items, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items));
+    CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));
+
+    // Check for correctness (and display results, if specified): partition compares all num_items outputs, selection only the first num_selected
+    int compare1 = (IS_PARTITION) ?
+        CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose) :
+        CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
+    printf("\t Data %s\n", compare1 ? "FAIL" : "PASS");
+
+    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s\n", compare2 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance: one dispatch call performs g_timing_iterations iterations between the timer start and stop
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, false));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance (bytes counted: input items + output items, plus the flags when IS_FLAGGED)
+    if (g_timing_iterations > 0)
+    {
+        float   avg_millis          = elapsed_millis / g_timing_iterations;
+        float   giga_rate           = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        int     num_output_items    = (IS_PARTITION) ? num_items : num_selected;
+        int     num_flag_items      = (IS_FLAGGED) ? num_items : 0;
+        size_t  num_bytes           = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items;
+        float   giga_bandwidth      = float(num_bytes) / avg_millis / 1000.0f / 1000.0f;
+
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup
+    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts (performed after cleanup; both the data and the count comparison must have returned 0)
+    AssertEquals(0, compare1 | compare2);
+}
+
+
+/**
+ * Test on pointer type: generates random host input, solves the reference on the host, copies the input to the device and runs Test() on the raw device pointer
+ */
+template <
+    Backend         BACKEND,
+    bool            IS_FLAGGED,
+    bool            IS_PARTITION,
+    typename        T>
+void TestPointer(
+    int             num_items,
+    float           select_ratio)
+{
+    typedef char FlagT;
+
+    // Allocate host arrays
+    T*      h_in        = new T[num_items];
+    FlagT*  h_flags     = new FlagT[num_items];
+    T*      h_reference = new T[num_items];
+
+    // Initialize input
+    Initialize(h_in, num_items);
+
+    // Select a comparison value that is select_ratio through the space of [0,127]
+    T compare;
+    if (select_ratio <= 0.0)
+        InitValue(INTEGER_SEED, compare, 0);        // select none
+    else if (select_ratio >= 1.0)
+        InitValue(INTEGER_SEED, compare, 127);      // select all
+    else
+        InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
+
+    // Host reference solution; this also fills h_flags
+    LessThan<T> select_op(compare);
+    int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
+
+    if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
+    printf("\nPointer %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",     // backend first: the banner reads "<backend> cub::<algorithm>::<entrypoint>"
+        (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
+        (IS_FLAGGED) ? "Flagged" : "If",
+        num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T));
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    T *d_in = NULL;
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<BACKEND, IS_FLAGGED, IS_PARTITION>(d_in, h_flags, select_op, h_reference, num_selected, num_items);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (h_flags) delete[] h_flags;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/**
+ * Test on iterator type: uses a CountingInputIterator starting at 0 as the input, solves the reference on the host and runs Test() on the iterator itself
+ */
+template <
+    Backend         BACKEND,
+    bool            IS_FLAGGED,
+    bool            IS_PARTITION,
+    typename        T>
+void TestIterator(
+    int             num_items,
+    float           select_ratio)
+{
+    typedef char FlagT;
+
+    // Allocate host arrays
+    T*      h_reference = new T[num_items];
+    FlagT*  h_flags = new FlagT[num_items];
+
+    // Use counting iterator as the input
+    CountingInputIterator<T, int> h_in(0);
+
+    // Select a comparison value that is select_ratio through the space of [0,127]
+    T compare;
+    if (select_ratio <= 0.0)
+        InitValue(INTEGER_SEED, compare, 0);        // select none
+    else if (select_ratio >= 1.0)
+        InitValue(INTEGER_SEED, compare, 127);      // select all
+    else
+        InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
+
+    // Host reference solution; this also fills h_flags
+    LessThan<T> select_op(compare);
+    int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
+
+    if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
+    printf("\nIterator %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",     // backend first: the banner reads "<backend> cub::<algorithm>::<entrypoint>"
+        (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
+        (IS_FLAGGED) ? "Flagged" : "If",
+        num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T));
+    fflush(stdout);
+
+    // Run Test
+    Test<BACKEND, IS_FLAGGED, IS_PARTITION>(h_in, h_flags, select_op, h_reference, num_selected, num_items);
+
+    // Cleanup
+    if (h_reference) delete[] h_reference;
+    if (h_flags) delete[] h_flags;
+}
+
+
+/**
+ * Test different selection ratios: runs the pointer test at ratios 0.0, 0.2, 0.4, 0.6, 0.8 and 1.0
+ */
+template <
+    Backend         BACKEND,
+    bool            IS_FLAGGED,
+    bool            IS_PARTITION,
+    typename        T>
+void Test(
+    int             num_items)
+{
+    for (int step = 0; step <= 5; ++step)    // integer counter: the trip count cannot drift with float rounding
+    {
+        TestPointer<BACKEND, IS_FLAGGED, IS_PARTITION, T>(num_items, float(step) * 0.2f);
+    }
+}
+
+
+/**
+ * Test (select vs. partition) and (flagged vs. functor): runs all four combinations for one backend
+ */
+template <
+    Backend         BACKEND,
+    typename        T>
+void TestMethod(
+    int             num_items)
+{
+    // Functor (IS_FLAGGED = false)
+    Test<BACKEND, false, false, T>(num_items);   // IS_PARTITION = false
+    Test<BACKEND, false, true, T>(num_items);    // IS_PARTITION = true
+
+    // Flagged (IS_FLAGGED = true)
+    Test<BACKEND, true, false, T>(num_items);    // IS_PARTITION = false
+    Test<BACKEND, true, true, T>(num_items);     // IS_PARTITION = true
+}
+
+
+/**
+ * Test different dispatch: always the CUB backend, plus the CDP backend when CUB_CDP is defined
+ */
+template <
+    typename        T>
+void TestOp(
+    int             num_items)
+{
+    TestMethod<CUB, T>(num_items);
+#ifdef CUB_CDP
+    TestMethod<CDP, T>(num_items);
+#endif
+}
+
+
+/**
+ * Test different input sizes: a negative num_items requests the built-in sweep of sizes
+ */
+template <typename T>
+void Test(
+    int             num_items)
+{
+    if (num_items >= 0)
+    {
+        TestOp<T>(num_items);
+        return;
+    }
+
+    // No specific size requested: sweep from an empty input up to one million items
+    static const int SWEEP_SIZES[] = {0, 1, 100, 10000, 1000000};
+    for (int i = 0; i < int(sizeof(SWEEP_SIZES) / sizeof(SWEEP_SIZES[0])); ++i)
+    {
+        TestOp<T>(SWEEP_SIZES[i]);
+    }
+}
+
+/**
+ * Test select/partition on pointer types: runs each of the four variants on CUB, then on Thrust
+ */
+template <typename T>
+void ComparePointer(
+    int             num_items,
+    float           select_ratio)
+{
+    printf("-- Select-if ----------------------------\n");
+    TestPointer<CUB, false, false, T>(num_items, select_ratio);       // template args: backend, IS_FLAGGED, IS_PARTITION, T
+    TestPointer<THRUST, false, false, T>(num_items, select_ratio);
+
+    printf("-- Partition-if ----------------------------\n");
+    TestPointer<CUB, false, true, T>(num_items, select_ratio);
+    TestPointer<THRUST, false, true, T>(num_items, select_ratio);
+
+    printf("-- Select-flagged ----------------------------\n");
+    TestPointer<CUB, true, false, T>(num_items, select_ratio);
+    TestPointer<THRUST, true, false, T>(num_items, select_ratio);
+
+    printf("-- Partition-flagged ----------------------------\n");
+    TestPointer<CUB, true, true, T>(num_items, select_ratio);
+    TestPointer<THRUST, true, true, T>(num_items, select_ratio);
+
+}
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parses the command line, initializes the device, then runs the suite chosen at compile time
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;       // negative: let the selected suite pick its own size(s)
+    float select_ratio      = 0.5f;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("ratio", select_ratio);
+
+    // Print usage (every optional argument is shown in balanced brackets)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--ratio=<selection ratio, default 0.5>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v] "
+            "[--cdp] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic CUB test (int elements, CUB backend only)
+    if (num_items < 0) num_items = 32000000;
+
+    printf("-- Select-if ----------------------------\n");
+    TestPointer<CUB, false, false, int>(num_items, select_ratio);
+
+    printf("-- Partition-if ----------------------------\n");
+    TestPointer<CUB, false, true, int>(num_items, select_ratio);
+
+    printf("-- Select-flagged ----------------------------\n");
+    TestPointer<CUB, true, false, int>(num_items, select_ratio);
+
+    printf("-- Partition-flagged ----------------------------\n");
+    TestPointer<CUB, true, true, int>(num_items, select_ratio);
+
+
+#elif defined(QUICK_TEST)
+
+    // Get device ordinal
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get device SM version
+    int sm_version;
+    CubDebugExit(SmVersion(sm_version, device_ordinal));
+
+    // Compile/run quick tests
+    if (num_items < 0) num_items = 32000000;
+
+    printf("-- Iterator ----------------------------\n");
+    TestIterator<CUB, false, false, int>(num_items, select_ratio);
+
+    ComparePointer<char>(       num_items * ((sm_version <= 130) ? 1 : 4),  select_ratio);
+    ComparePointer<short>(      num_items * ((sm_version <= 130) ? 1 : 2),  select_ratio);
+    ComparePointer<int>(        num_items,                                  select_ratio);
+    ComparePointer<long long>(  num_items / 2,                              select_ratio);
+    ComparePointer<TestFoo>(    num_items / 4,                              select_ratio);
+
+#else
+
+    // Compile/run thorough tests (the loop bound is inclusive: g_repeat + 1 passes)
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test different input types
+        Test<unsigned char>(num_items);
+        Test<unsigned short>(num_items);
+        Test<unsigned int>(num_items);
+        Test<unsigned long long>(num_items);
+
+        Test<uchar2>(num_items);
+        Test<ushort2>(num_items);
+        Test<uint2>(num_items);
+        Test<ulonglong2>(num_items);
+
+        Test<uchar4>(num_items);
+        Test<ushort4>(num_items);
+        Test<uint4>(num_items);
+        Test<ulonglong4>(num_items);
+
+        Test<TestFoo>(num_items);
+        Test<TestBar>(num_items);
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/test/test_device_select_unique.cu b/external/cub/test/test_device_select_unique.cu
new file mode 100644
index 00000000000..bd40a5c0eb9
--- /dev/null
+++ b/external/cub/test/test_device_select_unique.cu
@@ -0,0 +1,651 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of DeviceSelect::Unique utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <thrust/device_ptr.h>
+#include <thrust/unique.h>
+
+#include <cub/util_allocator.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/device/device_select.cuh>
+
+#include <thrust/device_ptr.h>
+#include <thrust/unique.h>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose               = false;   // set by the --v flag; enables extra printing of inputs/results
+int                     g_timing_iterations     = 0;       // set by --i; number of invocations in the timed pass
+int                     g_repeat                = 0;       // set by --repeat; the thorough suite runs g_repeat + 1 times
+float                   g_device_giga_bandwidth;           // set in main from args.device_giga_bandwidth; used for "% peak"
+CachingDeviceAllocator  g_allocator(true);                 // allocator used for every device buffer in this test
+
+// Dispatch types: each enumerator selects one Dispatch() overload through Int2Type<...>
+enum Backend
+{
+    CUB,        // CUB method
+    THRUST,     // Thrust method
+    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
+};
+
+
+//---------------------------------------------------------------------
+// Dispatch to different CUB DeviceSelect entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * Dispatch to unique entrypoint: calls DeviceSelect::Unique timing_timing_iterations times
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t Dispatch(
+    Int2Type<CUB>               dispatch_to,                // overload-selection tag; not read in the body
+    int                         timing_timing_iterations,   // number of back-to-back invocations
+    size_t                      *d_temp_storage_bytes,      // not used by this overload
+    cudaError_t                 *d_cdp_error,               // not used by this overload
+
+    void*               d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    OutputIteratorT              d_out,
+    NumSelectedIteratorT         d_num_selected_out,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    cudaError_t error = cudaSuccess;    // also the result when no iteration runs
+    for (int i = 0; i < timing_timing_iterations; ++i)
+    {
+        error = DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
+    }
+    return error;                       // status of the last invocation only
+}
+
+
+//---------------------------------------------------------------------
+// Dispatch to different Thrust entrypoints
+//---------------------------------------------------------------------
+
+
+/**
+ * Dispatch to unique entrypoint (Thrust): runs thrust::unique_copy and stores the resulting count on the device
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__host__ __forceinline__
+cudaError_t Dispatch(
+    Int2Type<THRUST>            dispatch_to,                // overload-selection tag; not read in the body
+    int                         timing_timing_iterations,   // number of back-to-back invocations
+    size_t                      *d_temp_storage_bytes,      // not used by this overload
+    cudaError_t                 *d_cdp_error,               // not used by this overload
+
+    void                        *d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    OutputIteratorT             d_out,
+    NumSelectedIteratorT        d_num_selected_out,
+    OffsetT                     num_items,
+    cudaStream_t                stream,
+    bool                        debug_synchronous)
+{
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    if (d_temp_storage == 0)
+    {
+        temp_storage_bytes = 1;     // size query: report a token size so the caller allocates something
+    }
+    else if (timing_timing_iterations > 0)  // with no iteration there is no end pointer, so leave the device count untouched
+    {
+        thrust::device_ptr<OutputT> d_out_wrapper_end;
+        thrust::device_ptr<InputT> d_in_wrapper(d_in);
+        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
+        for (int i = 0; i < timing_timing_iterations; ++i)
+        {
+            d_out_wrapper_end = thrust::unique_copy(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
+        }
+
+        OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);     // items written by the last pass
+        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
+
+    }
+
+    return cudaSuccess;
+}
+
+
+
+//---------------------------------------------------------------------
+// CUDA Nested Parallelism Test Kernel
+//---------------------------------------------------------------------
+
+/**
+ * Simple wrapper kernel to invoke DeviceSelect from device code (launched <<<1,1>>> by the CDP Dispatch overload)
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+__global__ void CnpDispatchKernel(
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,      // receives temp_storage_bytes after the nested dispatch
+    cudaError_t                 *d_cdp_error,               // receives the status of the nested dispatch
+
+    void*               d_temp_storage,
+    size_t                      temp_storage_bytes,
+    InputIteratorT              d_in,
+    OutputIteratorT              d_out,
+    NumSelectedIteratorT         d_num_selected_out,
+    OffsetT                     num_items,
+    bool                        debug_synchronous)
+{
+
+#ifndef CUB_CDP
+    *d_cdp_error = cudaErrorNotSupported;   // built without CUB_CDP: only report that the path is unavailable
+#else
+    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, debug_synchronous);
+    *d_temp_storage_bytes = temp_storage_bytes;     // temp_storage_bytes is passed by reference to Dispatch above
+#endif
+}
+
+
+/**
+ * Dispatch to CDP kernel: runs the CUB dispatch from a single device thread and copies back size and status
+ */
+template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
+cudaError_t Dispatch(
+    Int2Type<CDP>               dispatch_to,                // overload-selection tag; not read in the body
+    int                         timing_timing_iterations,
+    size_t                      *d_temp_storage_bytes,      // device word the kernel writes temp_storage_bytes to
+    cudaError_t                 *d_cdp_error,               // device word the kernel writes its status to
+
+    void*               d_temp_storage,
+    size_t                      &temp_storage_bytes,
+    InputIteratorT              d_in,
+    OutputIteratorT              d_out,
+    NumSelectedIteratorT         d_num_selected_out,
+    OffsetT                     num_items,
+    cudaStream_t                stream,                     // not used: the kernel is launched on the default stream
+    bool                        debug_synchronous)
+{
+    // Invoke kernel to invoke device-side dispatch
+    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
+        d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, debug_synchronous);
+    CubDebugExit(cudaGetLastError());   // a rejected launch would otherwise leave *d_cdp_error unwritten
+    // Copy out temp_storage_bytes
+    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
+
+    // Copy out error
+    cudaError_t retval;
+    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
+    return retval;
+}
+
+
+
+//---------------------------------------------------------------------
+// Test generation
+//---------------------------------------------------------------------
+
+
+/**
+ * Initialize problem: fills h_in[0..num_items) with runs of identical items, one key value per run
+ */
+template <typename T>
+void Initialize(
+    int         entropy_reduction,
+    T           *h_in,
+    int         num_items,
+    int         max_segment)
+{
+    unsigned int max_int = (unsigned int) -1;   // largest unsigned int; used to scale the random run length
+
+    int key = 0;    // value index handed to InitValue for the current run
+    int i = 0;      // first position of the current run
+    while (i < num_items)
+    {
+        // Select number of repeating occurrences for the current run
+        int repeat;
+        if (max_segment < 0)
+        {
+            repeat = num_items;     // negative max_segment: a single run covers the whole input
+        }
+        else if (max_segment < 2)
+        {
+            repeat = 1;             // max_segment of 0 or 1: every run has length one
+        }
+        else
+        {
+            RandomBits(repeat, entropy_reduction);
+            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));  // NOTE(review): repeat is signed; a negative random value scales negative - confirm intended
+            repeat = CUB_MAX(1, repeat);    // a run is never shorter than one item
+        }
+
+        int j = i;
+        while (j < CUB_MIN(i + repeat, num_items))     // the last run is truncated at the end of the input
+        {
+            InitValue(INTEGER_SEED, h_in[j], key);
+            j++;
+        }
+
+        i = j;
+        key++;      // the next run is initialized from the next key
+    }
+
+    if (g_verbose)
+    {
+        printf("Input:\n");
+        DisplayResults(h_in, num_items);
+        printf("\n\n");
+    }
+}
+
+
+/**
+ * Solve unique problem on the host: keep the first item of each run of equal consecutive items
+ */
+template <
+    typename        InputIteratorT,
+    typename        T>
+int Solve(
+    InputIteratorT  h_in,
+    T               *h_reference,
+    int             num_items)
+{
+    // Nothing is kept for an empty problem
+    if (num_items <= 0)
+    {
+        return 0;
+    }
+
+    // The first item always starts a run
+    h_reference[0] = h_in[0];
+    int kept = 1;
+
+    // Every later item is kept only when it differs from its predecessor
+    for (int idx = 1; idx < num_items; ++idx)
+    {
+        if (h_in[idx] != h_in[idx - 1])
+            h_reference[kept++] = h_in[idx];
+    }
+    return kept;
+}
+
+
+
+/**
+ * Test DeviceSelect for a given problem input: size query, correctness run, optional timed run, then assert
+ */
+template <
+    Backend             BACKEND,
+    typename            DeviceInputIteratorT,
+    typename            T>
+void Test(
+    DeviceInputIteratorT d_in,
+    T                   *h_reference,
+    int                 num_selected,
+    int                 num_items)
+{
+    // Allocate device output array and num selected
+    T       *d_out            = NULL;
+    int     *d_num_selected_out   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
+
+    // Allocate CDP device arrays (allocated for every backend; only the CDP path writes them)
+    size_t          *d_temp_storage_bytes = NULL;
+    cudaError_t     *d_cdp_error = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
+
+    // Allocate temporary storage: the first dispatch passes a NULL buffer to obtain temp_storage_bytes
+    void            *d_temp_storage = NULL;
+    size_t          temp_storage_bytes = 0;
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, true));
+    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+    // Clear device output array and count so stale data cannot pass the comparison
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items));
+    CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));
+
+    // Run warmup/correctness iteration
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, true));
+
+    // Check for correctness (and display results, if specified); a nonzero result is reported as FAIL
+    int compare1 = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
+    printf("\t Data %s ", compare1 ? "FAIL" : "PASS");
+
+    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
+    printf("\t Count %s ", compare2 ? "FAIL" : "PASS");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Performance: time g_timing_iterations back-to-back invocations (none when it is zero)
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, false));
+    gpu_timer.Stop();
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+
+    // Display performance (skipped when no timing iterations were requested)
+    if (g_timing_iterations > 0)
+    {
+        float avg_millis        = elapsed_millis / g_timing_iterations;
+        float giga_rate         = float(num_items) / avg_millis / 1000.0f / 1000.0f;
+        float giga_bandwidth    = float((num_items + num_selected) * sizeof(T)) / avg_millis / 1000.0f / 1000.0f;     // counts items read plus items written
+        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
+    }
+    printf("\n\n");
+
+    // Flush any stdout/stderr
+    fflush(stdout);
+    fflush(stderr);
+
+    // Cleanup (done before asserting, so the buffers are returned before AssertEquals is evaluated)
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
+    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
+    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
+    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+
+    // Correctness asserts: both the data and the count comparison must have returned zero
+    AssertEquals(0, compare1 | compare2);
+}
+
+
+/**
+ * Test DeviceSelect on pointer type: generates a host problem, copies it to the device and runs Test
+ */
+template <
+    Backend         BACKEND,
+    typename        T>
+void TestPointer(
+    int             num_items,
+    int             entropy_reduction,
+    int             max_segment)
+{
+    // Allocate host arrays
+    T*  h_in        = new T[num_items];
+    T*  h_reference = new T[num_items];
+
+    // Initialize problem and solution
+    Initialize(entropy_reduction, h_in, num_items, max_segment);
+    int num_selected = Solve(h_in, h_reference, num_items);
+
+    printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        num_items, num_selected, (num_selected > 0) ? float(num_items) / num_selected : 0.0f,   // avoid 0/0 for an empty problem
+        typeid(T).name(),
+        (int) sizeof(T),
+        entropy_reduction);
+    fflush(stdout);
+
+    // Allocate problem device arrays
+    T *d_in = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
+
+    // Initialize device input
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
+
+    // Run Test
+    Test<BACKEND>(d_in, h_reference, num_selected, num_items);
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+}
+
+
+/**
+ * Test DeviceSelect on iterator type: the input is a counting iterator rather than a device array
+ */
+template <
+    Backend         BACKEND,
+    typename        T>
+void TestIterator(
+    int             num_items)
+{
+    // Use a counting iterator as the input
+    CountingInputIterator<T, int> h_in(0);
+
+    // Allocate host arrays
+    T*  h_reference = new T[num_items];
+
+    // Initialize problem and solution
+    int num_selected = Solve(h_in, h_reference, num_items);
+
+    printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n",
+        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
+        num_items, num_selected, (num_selected > 0) ? float(num_items) / num_selected : 0.0f,   // avoid 0/0 for an empty problem
+        typeid(T).name(),
+        (int) sizeof(T));
+    fflush(stdout);
+
+    // Run Test
+    Test<BACKEND>(h_in, h_reference, num_selected, num_items);
+
+    // Cleanup
+    if (h_reference) delete[] h_reference;
+}
+
+
+/**
+ * Test different gen modes: max_segment = 1, 11, 121, ... while below num_items (no runs if num_items <= 1)
+ */
+template <
+    Backend         BACKEND,
+    typename        T>
+void Test(
+    int             num_items)
+{
+    static const int ENTROPY_REDUCTIONS[] = {0, 2, 7};     // entropy-reduction settings tried per segment bound
+    for (int max_segment = 1; ((max_segment > 0) && (max_segment < num_items)); max_segment *= 11)
+    {
+        for (int e = 0; e < 3; ++e)
+            TestPointer<BACKEND, T>(num_items, ENTROPY_REDUCTIONS[e], max_segment);
+    }
+}
+
+
+/**
+ * Test different dispatch: always the CUB backend, plus the CDP backend when CUB_CDP is defined
+ */
+template <
+    typename        T>
+void TestOp(
+    int             num_items)
+{
+    Test<CUB, T>(num_items);
+#ifdef CUB_CDP
+    Test<CDP, T>(num_items);
+#endif
+}
+
+
+/**
+ * Test different input sizes: a negative num_items requests the built-in sweep of sizes
+ */
+template <typename T>
+void Test(
+    int             num_items)
+{
+    if (num_items >= 0)
+    {
+        TestOp<T>(num_items);
+        return;
+    }
+
+    // No specific size requested: sweep from an empty input up to one million items
+    static const int SWEEP_SIZES[] = {0, 1, 100, 10000, 1000000};
+    for (int i = 0; i < int(sizeof(SWEEP_SIZES) / sizeof(SWEEP_SIZES[0])); ++i)
+    {
+        TestOp<T>(SWEEP_SIZES[i]);
+    }
+}
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parses the command line, initializes the device, then runs the suite chosen at compile time
+ */
+int main(int argc, char** argv)
+{
+    int num_items           = -1;       // negative: let the selected suite pick its own size(s)
+    int entropy_reduction   = 0;
+    int maxseg              = 1000;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("n", num_items);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    args.GetCmdLineArgument("repeat", g_repeat);
+    args.GetCmdLineArgument("maxseg", maxseg);
+    args.GetCmdLineArgument("entropy", entropy_reduction);
+
+    // Print usage (every optional argument is shown in balanced brackets, separated by a space)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--n=<input items>] "
+            "[--i=<timing iterations>] "
+            "[--device=<device-id>] "
+            "[--maxseg=<max segment length>] "
+            "[--entropy=<segment length bit entropy reduction rounds>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v] "
+            "[--cdp]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+    g_device_giga_bandwidth = args.device_giga_bandwidth;
+    printf("\n");
+
+#ifdef QUICKER_TEST
+
+    // Compile/run basic CUB test (int elements, CUB backend only)
+    if (num_items < 0) num_items = 32000000;
+    TestPointer<CUB, int>(         num_items,                                 entropy_reduction, maxseg);
+
+#elif defined(QUICK_TEST)
+
+    // Get device ordinal
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get device SM version
+    int sm_version;
+    CubDebugExit(SmVersion(sm_version, device_ordinal));
+
+    // Compile/run quick tests
+    if (num_items < 0) num_items = 32000000;
+
+    printf("-- Iterator ----------------------------\n");
+    TestIterator<CUB, int>(        num_items);
+
+    printf("----------------------------\n");
+    TestPointer<CUB, char>(        num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg);
+    TestPointer<THRUST, char>(     num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg);
+
+    printf("----------------------------\n");
+    TestPointer<CUB, short>(       num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg);
+    TestPointer<THRUST, short>(    num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg);
+
+    printf("----------------------------\n");
+    TestPointer<CUB, int>(         num_items,                                 entropy_reduction, maxseg);
+    TestPointer<THRUST, int>(      num_items,                                 entropy_reduction, maxseg);
+
+    printf("----------------------------\n");
+    TestPointer<CUB, long long>(   num_items / 2,                             entropy_reduction, maxseg);
+    TestPointer<THRUST, long long>(num_items / 2,                             entropy_reduction, maxseg);
+
+    printf("----------------------------\n");
+    TestPointer<CUB, TestFoo>(     num_items / 4,                             entropy_reduction, maxseg);
+    TestPointer<THRUST, TestFoo>(  num_items / 4,                             entropy_reduction, maxseg);
+
+#else
+
+    // Compile/run thorough tests (g_repeat + 1 passes; --maxseg and --entropy are not used in this mode)
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test different input types
+        Test<unsigned char>(num_items);
+        Test<unsigned short>(num_items);
+        Test<unsigned int>(num_items);
+        Test<unsigned long long>(num_items);
+
+        Test<uchar2>(num_items);
+        Test<ushort2>(num_items);
+        Test<uint2>(num_items);
+        Test<ulonglong2>(num_items);
+
+        Test<uchar4>(num_items);
+        Test<ushort4>(num_items);
+        Test<uint4>(num_items);
+        Test<ulonglong4>(num_items);
+
+        Test<TestFoo>(num_items);
+        Test<TestBar>(num_items);
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/test/test_grid_barrier.cu b/external/cub/test/test_grid_barrier.cu
new file mode 100644
index 00000000000..24a0e3ce2d8
--- /dev/null
+++ b/external/cub/test/test_grid_barrier.cu
@@ -0,0 +1,152 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test evaluation for software global barrier throughput
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+
+#include <cub/grid/grid_barrier.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * Kernel in which each thread calls the software global barrier
+ * the requested number of times
+ */
+__global__ void Kernel(
+    GridBarrier global_barrier,
+    int iterations)
+{
+    // Count down; a non-positive iteration count performs no barrier at all
+    for (int remaining = iterations; remaining > 0; --remaining)
+        global_barrier.Sync();
+}
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: time a kernel that performs the requested number of software global barriers
+ */
+int main(int argc, char** argv)
+{
+    cudaError_t retval = cudaSuccess;
+
+    // Defaults (grid_size == -1 means "derive the grid from the computed occupancy")
+    int iterations = 10000;
+    int block_size = 128;
+    int grid_size = -1;
+
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+
+    // Get args
+    args.GetCmdLineArgument("i", iterations);
+    args.GetCmdLineArgument("grid-size", grid_size);
+    args.GetCmdLineArgument("block-size", block_size);
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--i=<iterations>] "
+            "[--grid-size=<grid-size>] "
+            "[--block-size=<block-size>]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Reject a non-positive block size up front: it is used as an integer divisor below
+    if (block_size <= 0)
+    {
+        fprintf(stderr, "Invalid --block-size=%d (must be positive)\n", block_size);
+        exit(1);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Get device ordinal and SM version (sm_version is not used further in this function)
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+    int sm_version;
+    CubDebugExit(SmVersion(sm_version, device_ordinal));
+
+    // Get SM properties
+    int sm_count, max_block_threads, max_sm_occupancy;
+    CubDebugExit(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
+    CubDebugExit(cudaDeviceGetAttribute(&max_block_threads, cudaDevAttrMaxThreadsPerBlock, device_ordinal));
+    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, EmptyKernel<void>, 32));
+
+    // Compute grid size and occupancy
+    int occupancy = CUB_MIN((max_block_threads / block_size), max_sm_occupancy);
+    if (grid_size == -1)
+        grid_size = occupancy * sm_count;       // Default: occupancy blocks on every SM
+    else
+        occupancy = grid_size / sm_count;       // User grid: report the per-SM occupancy it implies (NOTE(review): an oversubscribed grid presumably hangs in the barrier -- confirm)
+
+    printf("Initializing software global barrier for Kernel<<<%d,%d>>> with %d occupancy\n",
+        grid_size, block_size, occupancy);
+    fflush(stdout);
+
+    // Init global barrier (exit on failure instead of launching with an unusable barrier)
+    GridBarrierLifetime global_barrier;
+    CubDebugExit(global_barrier.Setup(grid_size));
+
+    // Time kernel
+    GpuTimer gpu_timer;
+    gpu_timer.Start();
+    Kernel<<<grid_size, block_size>>>(global_barrier, iterations);
+    gpu_timer.Stop();
+
+    // Check for launch-configuration errors first, then wait for (and check) kernel completion
+    retval = CubDebug(cudaPeekAtLastError());
+    if (retval == cudaSuccess) retval = CubDebug(cudaDeviceSynchronize());
+
+    // Output timing results
+    float elapsed_millis = gpu_timer.ElapsedMillis();
+    printf("%d iterations, %f total elapsed millis, %f avg elapsed millis\n",
+        iterations, elapsed_millis, elapsed_millis / float(iterations));
+
+    return retval;
+}
diff --git a/external/cub/test/test_iterator.cu b/external/cub/test/test_iterator.cu
new file mode 100644
index 00000000000..fbcbdd2d904
--- /dev/null
+++ b/external/cub/test/test_iterator.cu
@@ -0,0 +1,805 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of iterator utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <iterator>
+#include <stdio.h>
+#include <typeinfo>
+
+#include <cub/iterator/arg_index_input_iterator.cuh>
+#include <cub/iterator/cache_modified_input_iterator.cuh>
+#include <cub/iterator/cache_modified_output_iterator.cuh>
+#include <cub/iterator/constant_input_iterator.cuh>
+#include <cub/iterator/counting_input_iterator.cuh>
+#include <cub/iterator/tex_obj_input_iterator.cuh>
+#include <cub/iterator/tex_ref_input_iterator.cuh>
+#include <cub/iterator/transform_input_iterator.cuh>
+
+#include <cub/util_type.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+#include <thrust/device_ptr.h>
+#include <thrust/copy.h>
+
+using namespace cub;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;      // Set from the "v" command-line flag in main() and forwarded to CompareDeviceResults
+CachingDeviceAllocator  g_allocator(true);      // Allocator behind every DeviceAllocate/DeviceFree in this file (NOTE(review): meaning of `true` is not visible here -- confirm)
+
+// Dispatch back-ends
+enum Backend
+{
+    CUB     = 0,    // Dispatch to the CUB method
+    THRUST  = 1,    // Dispatch to the Thrust method
+    CDP     = 2     // GPU-based (dynamic parallelism) dispatch to the CUB method
+};
+
+
+template <typename T>
+struct TransformOp
+{
+    // Returns the input plus the T value that InitValue(INTEGER_SEED, ..., 1) yields
+    __host__ __device__ __forceinline__ T operator()(T input) const
+    {
+        T increment;
+        InitValue(INTEGER_SEED, increment, 1);
+        return input + increment;
+    }
+};
+
+struct SelectOp
+{
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(T /*input*/) const    // const (like TransformOp): stateless predicate, also callable on const instances
+    {
+        return true;    // Selects every item, whatever its value
+    }
+};
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * Test random access input iterator: writes eight dereferenced values to d_out and two iterator snapshots to d_itrs
+ */
+template <
+    typename InputIteratorT,
+    typename T>
+__global__ void Kernel(
+    InputIteratorT    d_in,         // Iterator under test (passed by value, so the moves below are local to the kernel)
+    T                 *d_out,       // Receives the dereferenced values (indices 0 through 7 are written)
+    InputIteratorT    *d_itrs)      // Receives the iterator as it stands at offset 21 and again at offset 0
+{
+    d_out[0] = *d_in;               // Value at offset 0 (unary dereference)
+    d_out[1] = d_in[100];           // Value at offset 100 (subscript)
+    d_out[2] = *(d_in + 1000);      // Value at offset 1000 (addition, then dereference)
+    d_out[3] = *(d_in + 10000);     // Value at offset 10000 (the furthest offset read)
+
+    d_in++;                         // Post-increment: now at offset 1
+    d_out[4] = d_in[0];             // Value at offset 1
+
+    d_in += 20;                     // Compound addition: now at offset 21
+    d_out[5] = d_in[0];             // Value at offset 21
+    d_itrs[0] = d_in;               // Iterator at offset 21
+
+    d_in -= 10;                     // Compound subtraction: now at offset 11
+    d_out[6] = d_in[0];             // Value at offset 11
+
+    d_in -= 11;                     // Back at offset 0
+    d_out[7] = d_in[0];             // Value at offset 0
+    d_itrs[1] = d_in;               // Iterator at offset 0
+}
+
+
+
+//---------------------------------------------------------------------
+// Host testing subroutines
+//---------------------------------------------------------------------
+
+
+/**
+ * Launch the iterator kernel on the device and validate the values and
+ * iterator snapshots it wrote
+ */
+template <
+    typename        InputIteratorT,
+    typename        T,
+    int             TEST_VALUES>
+void Test(
+    InputIteratorT  d_in,
+    T               (&h_reference)[TEST_VALUES])
+{
+    const int NUM_ITERATOR_SNAPSHOTS = 2;
+
+    // Device buffers for the dereferenced values and the iterator snapshots
+    T                 *d_out    = NULL;
+    InputIteratorT    *d_itrs   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out,     TEST_VALUES * sizeof(T)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_itrs,    NUM_ITERATOR_SNAPSHOTS * sizeof(InputIteratorT)));
+
+    // Single-thread launch; check the launch itself, then its execution
+    Kernel<<<1, 1>>>(d_in, d_out, d_itrs);
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Dereferenced values versus the host reference
+    int values_diff = CompareDeviceResults(h_reference, d_out, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tValues: %s\n", (values_diff != 0) ? "FAIL" : "PASS");
+    AssertEquals(0, values_diff);
+
+    // First snapshot: the kernel's iterator after it advanced to offset 21
+    InputIteratorT h_expected_itr = d_in + 21;
+    int advanced_diff = CompareDeviceResults(&h_expected_itr, d_itrs, 1, g_verbose, g_verbose);
+    printf("\tIterators: %s\n", (advanced_diff != 0) ? "FAIL" : "PASS");
+    AssertEquals(0, advanced_diff);
+
+    // Second snapshot: the kernel's iterator after it returned to offset 0
+    int rewound_diff = CompareDeviceResults(&d_in, d_itrs + 1, 1, g_verbose, g_verbose);
+    printf("\tIterators: %s\n", (rewound_diff != 0) ? "FAIL" : "PASS");
+    AssertEquals(0, rewound_diff);
+
+    // Release device buffers
+    if (d_out != NULL)  CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_itrs != NULL) CubDebugExit(g_allocator.DeviceFree(d_itrs));
+}
+
+
+/**
+ * Test constant iterator: every offset of ConstantInputIterator<T>(base) is expected to yield base
+ */
+template <typename T>
+void TestConstant(T base)
+{
+    printf("\nTesting constant iterator on type %s (base: %lld)\n", typeid(T).name(), (long long) (base)); fflush(stdout);   // cast now matches the signed %lld conversion
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    T h_reference[8] = {base, base, base, base, base, base, base, base};
+    ConstantInputIterator<T> d_itr(base);
+    Test(d_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    int copy_items  = 100;
+    T   *h_copy     = new T[copy_items];
+    T   *d_copy     = NULL;
+
+    for (int i = 0; i < copy_items; ++i)
+        h_copy[i] = d_itr[i];       // Host-side expectation, read straight from the iterator
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * copy_items));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+
+    thrust::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp());     // SelectOp keeps every item
+
+    int compare = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    delete[] h_copy;                // new[] never yields NULL here, so no guard is needed
+    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif // THRUST_VERSION
+}
+
+
+/**
+ * Exercise CountingInputIterator<T> starting from the given base value
+ */
+template <typename T>
+void TestCounting(T base)
+{
+    printf("\nTesting counting iterator on type %s (base: %d) \n", typeid(T).name(), int(base)); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    // Offsets the kernel dereferences, in the order it writes them to d_out
+    const int NUM_REFERENCES            = 8;
+    const int OFFSETS[NUM_REFERENCES]   = {0, 100, 1000, 10000, 1, 21, 11, 0};
+
+    // The expected value at each offset is (base + offset)
+    T h_reference[NUM_REFERENCES];
+    for (int k = 0; k < NUM_REFERENCES; ++k)
+    {
+        h_reference[k] = base + OFFSETS[k];
+    }
+
+    CountingInputIterator<T> d_itr(base);
+    Test(d_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    unsigned long long  max_items   = ((1ull << ((sizeof(T) * 8) - 1)) - 1);
+    size_t              copy_items  = (size_t) CUB_MIN(max_items - base, 100);     // capped: differencing may overflow when T is too small to hold the offset
+
+    T *h_copy = new T[copy_items];  // Host-side expectation, read straight from the iterator
+    for (unsigned long long i = 0; i < copy_items; ++i)
+        h_copy[i] = d_itr[i];
+
+    T *d_copy = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * copy_items));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+    thrust::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp());
+
+    int mismatch = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (mismatch != 0) ? "FAIL" : "PASS");
+    AssertEquals(0, mismatch);
+
+    if (h_copy != NULL) delete[] h_copy;
+    if (d_copy != NULL) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif // THRUST_VERSION
+}
+
+
+/**
+ * Exercise CacheModifiedInputIterator with each load modifier, then CacheModifiedOutputIterator via thrust::copy_if()
+ */
+template <typename T, typename CastT>
+void TestModified()
+{
+    printf("\nTesting cache-modified iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    const unsigned int  TEST_VALUES = 11000;
+    const size_t        DATA_BYTES  = sizeof(T) * TEST_VALUES;
+
+    // Random host input
+    T *h_data = new T[TEST_VALUES];
+    for (unsigned int i = 0; i < TEST_VALUES; ++i)
+    {
+        RandomBits(h_data[i]);
+    }
+
+    // Mirror the input on the device
+    T *d_data = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, DATA_BYTES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, DATA_BYTES, cudaMemcpyHostToDevice));
+
+    // Expected values, in the order the kernel writes them to d_out
+    const int NUM_REFERENCES            = 8;
+    const int OFFSETS[NUM_REFERENCES]   = {0, 100, 1000, 10000, 1, 21, 11, 0};
+    T h_reference[NUM_REFERENCES];
+    for (int k = 0; k < NUM_REFERENCES; ++k)
+        h_reference[k] = h_data[OFFSETS[k]];
+
+    // One kernel test per load modifier
+    CastT *d_cast = (CastT*) d_data;
+    Test(CacheModifiedInputIterator<LOAD_DEFAULT, T>(d_cast), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CA, T>(d_cast), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CG, T>(d_cast), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CS, T>(d_cast), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_CV, T>(d_cast), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_LDG, T>(d_cast), h_reference);
+    Test(CacheModifiedInputIterator<LOAD_VOLATILE, T>(d_cast), h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    T *d_copy = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, DATA_BYTES));
+
+    // Read and write through cache-modified iterators; SelectOp keeps every item
+    CacheModifiedInputIterator<LOAD_CG, T>      d_in_itr((CastT*) d_data);
+    CacheModifiedOutputIterator<STORE_CG, T>    d_out_itr((CastT*) d_copy);
+    thrust::copy_if(d_in_itr, d_in_itr + TEST_VALUES, d_out_itr, SelectOp());
+
+    int mismatch = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (mismatch != 0) ? "FAIL" : "PASS");
+    AssertEquals(0, mismatch);
+
+    // Cleanup
+    if (d_copy != NULL) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif // THRUST_VERSION
+
+    if (h_data != NULL) delete[] h_data;
+    if (d_data != NULL) CubDebugExit(g_allocator.DeviceFree(d_data));
+}
+
+
+/**
+ * Exercise TransformInputIterator layered over a raw (CastT*) device pointer
+ */
+template <typename T, typename CastT>
+void TestTransform()
+{
+    printf("\nTesting transform iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    const unsigned int  TEST_VALUES = 11000;
+    const size_t        DATA_BYTES  = sizeof(T) * TEST_VALUES;
+
+    // Host input: element i is initialised from its own index
+    T *h_data = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+    {
+        InitValue(INTEGER_SEED, h_data[i], i);
+    }
+
+    // Mirror the input on the device
+    T *d_data = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, DATA_BYTES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, DATA_BYTES, cudaMemcpyHostToDevice));
+
+    TransformOp<T> op;
+
+    // Expected values: op applied to the input at each offset the kernel visits
+    const int NUM_REFERENCES            = 8;
+    const int OFFSETS[NUM_REFERENCES]   = {0, 100, 1000, 10000, 1, 21, 11, 0};
+    T h_reference[NUM_REFERENCES];
+    for (int k = 0; k < NUM_REFERENCES; ++k)
+    {
+        h_reference[k] = op(h_data[OFFSETS[k]]);
+    }
+
+    TransformInputIterator<T, TransformOp<T>, CastT*> d_itr((CastT*) d_data, op);
+    Test(d_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    // Host-side expectation for the whole range
+    T *h_copy = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+        h_copy[i] = op(h_data[i]);
+
+    T *d_copy = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, DATA_BYTES));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+    thrust::copy_if(d_itr, d_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
+
+    int mismatch = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (mismatch != 0) ? "FAIL" : "PASS");
+    AssertEquals(0, mismatch);
+
+    // Cleanup
+    if (h_copy != NULL) delete[] h_copy;
+    if (d_copy != NULL) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif // THRUST_VERSION
+
+    if (h_data != NULL) delete[] h_data;
+    if (d_data != NULL) CubDebugExit(g_allocator.DeviceFree(d_data));
+}
+
+
+/**
+ * Test tex-obj texture iterator (kernel manipulation, then thrust::copy_if() when Thrust is 1.7 or newer)
+ */
+template <typename T, typename CastT>
+void TestTexObj()
+{
+    printf("\nTesting tex-obj iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    const unsigned int TEST_VALUES          = 11000;
+    const unsigned int DUMMY_OFFSET         = 500;
+    const unsigned int DUMMY_TEST_VALUES    = TEST_VALUES - DUMMY_OFFSET;
+
+    T *h_data = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+    {
+        RandomBits(h_data[i]);
+    }
+
+    // Allocate device arrays (d_dummy is populated but never bound or read in this function)
+    T *d_data   = NULL;
+    T *d_dummy  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));
+
+    // Initialize reference data (same order in which the kernel writes d_out)
+    T h_reference[8];
+    h_reference[0] = h_data[0];          // Value at offset 0
+    h_reference[1] = h_data[100];        // Value at offset 100
+    h_reference[2] = h_data[1000];       // Value at offset 1000
+    h_reference[3] = h_data[10000];      // Value at offset 10000
+    h_reference[4] = h_data[1];          // Value at offset 1
+    h_reference[5] = h_data[21];         // Value at offset 21
+    h_reference[6] = h_data[11];         // Value at offset 11
+    h_reference[7] = h_data[0];          // Value at offset 0
+
+    // Create and bind obj-based test iterator
+    TexObjInputIterator<T> d_obj_itr;
+    CubDebugExit(d_obj_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
+
+    Test(d_obj_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    T *d_copy = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+
+    CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES));
+    thrust::copy_if(d_obj_itr, d_obj_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
+
+    int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif  // THRUST_VERSION
+
+    // Unbind outside the Thrust guard so the binding made above is undone even when that section is compiled out
+    CubDebugExit(d_obj_itr.UnbindTexture());
+    if (h_data) delete[] h_data;
+    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
+    if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy));
+}
+
+
+#if CUDA_VERSION >= 5050
+
+/**
+ * Test tex-ref texture iterator (kernel manipulation, then thrust::copy_if() when Thrust is 1.7 or newer)
+ */
+template <typename T, typename CastT>
+void TestTexRef()
+{
+    printf("\nTesting tex-ref iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    const unsigned int TEST_VALUES          = 11000;
+    const unsigned int DUMMY_OFFSET         = 500;
+    const unsigned int DUMMY_TEST_VALUES    = TEST_VALUES - DUMMY_OFFSET;
+
+    T *h_data = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+    {
+        RandomBits(h_data[i]);
+    }
+
+    // Allocate device arrays (d_dummy holds the tail of h_data starting at DUMMY_OFFSET)
+    T *d_data   = NULL;
+    T *d_dummy  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));
+
+    // Initialize reference data (same order in which the kernel writes d_out)
+    T h_reference[8];
+    h_reference[0] = h_data[0];          // Value at offset 0
+    h_reference[1] = h_data[100];        // Value at offset 100
+    h_reference[2] = h_data[1000];       // Value at offset 1000
+    h_reference[3] = h_data[10000];      // Value at offset 10000
+    h_reference[4] = h_data[1];          // Value at offset 1
+    h_reference[5] = h_data[21];         // Value at offset 21
+    h_reference[6] = h_data[11];         // Value at offset 11
+    h_reference[7] = h_data[0];          // Value at offset 0
+
+    // Create and bind ref-based test iterator (its __LINE__ argument differs from d_ref_itr2's, so the two are distinct types)
+    TexRefInputIterator<T, __LINE__> d_ref_itr;
+    CubDebugExit(d_ref_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
+
+    // Create and bind dummy iterator over different data to check for interference with d_ref_itr
+    TexRefInputIterator<T, __LINE__> d_ref_itr2;
+    CubDebugExit(d_ref_itr2.BindTexture((CastT*) d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
+
+    Test(d_ref_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    T *d_copy = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+
+    CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES));
+    thrust::copy_if(d_ref_itr, d_ref_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
+
+    int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif  // THRUST_VERSION
+
+    CubDebugExit(d_ref_itr.UnbindTexture());
+    CubDebugExit(d_ref_itr2.UnbindTexture());
+
+    if (h_data) delete[] h_data;
+    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
+    if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy));
+}
+
+
+/**
+ * Exercise TransformInputIterator layered over a TexRefInputIterator
+ */
+template <typename T, typename CastT>
+void TestTexTransform()
+{
+    printf("\nTesting tex-transform iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    const unsigned int  TEST_VALUES = 11000;
+    const size_t        DATA_BYTES  = sizeof(T) * TEST_VALUES;
+
+    T *h_data = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+    {
+        InitValue(INTEGER_SEED, h_data[i], i);
+    }
+
+    // Mirror the input on the device
+    T *d_data = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, DATA_BYTES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, DATA_BYTES, cudaMemcpyHostToDevice));
+
+    TransformOp<T> op;
+
+    // Expected values: op applied to the input at each offset the kernel visits
+    const int NUM_REFERENCES            = 8;
+    const int OFFSETS[NUM_REFERENCES]   = {0, 100, 1000, 10000, 1, 21, 11, 0};
+    T h_reference[NUM_REFERENCES];
+    for (int k = 0; k < NUM_REFERENCES; ++k)
+    {
+        h_reference[k] = op(h_data[OFFSETS[k]]);
+    }
+
+    // Texture iterator type, then the bound instance
+    // (NOTE(review): __LINE__ presumably acts as this iterator's unique ID -- confirm; the typedef is kept on its own line)
+    typedef TexRefInputIterator<T, __LINE__> TextureIterator;
+
+    TextureIterator d_tex_itr;
+    CubDebugExit(d_tex_itr.BindTexture((CastT*) d_data, DATA_BYTES));
+
+    // Layer the transform on top of the texture iterator
+    TransformInputIterator<T, TransformOp<T>, TextureIterator> xform_itr(d_tex_itr, op);
+
+    Test(xform_itr, h_reference);
+
+#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
+
+    //
+    // Test with thrust::copy_if()
+    //
+
+    T *h_copy = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+        h_copy[i] = op(h_data[i]);
+
+    T *d_copy = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, DATA_BYTES));
+    thrust::device_ptr<T> d_copy_wrapper(d_copy);
+
+    thrust::copy_if(xform_itr, xform_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
+
+    int mismatch = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tthrust::copy_if(): %s\n", (mismatch != 0) ? "FAIL" : "PASS");
+    AssertEquals(0, mismatch);
+
+    // Cleanup
+    if (h_copy != NULL) delete[] h_copy;
+    if (d_copy != NULL) CubDebugExit(g_allocator.DeviceFree(d_copy));
+
+#endif  // THRUST_VERSION
+
+    CubDebugExit(d_tex_itr.UnbindTexture());
+    if (h_data != NULL) delete[] h_data;
+    if (d_data != NULL) CubDebugExit(g_allocator.DeviceFree(d_data));
+}
+
+#endif  // CUDA_VERSION
+
+
+
+
+/**
+ * Run non-integer tests (also invoked by the integer overload once its own tests have run)
+ */
+template <typename T, typename CastT>
+void Test(Int2Type<false> is_integer)
+{
+    TestModified<T, CastT>();
+    TestTransform<T, CastT>();
+
+#if CUB_CDP
+    // Test tex-obj iterators if CUDA dynamic parallelism enabled (TestTexObj takes no arguments)
+    TestTexObj<T, CastT>();
+#endif  // CUB_CDP
+
+#if CUDA_VERSION >= 5050
+    // Test tex-ref iterators for CUDA 5.5
+    TestTexRef<T, CastT>();
+    TestTexTransform<T, CastT>();
+#endif  // CUDA_VERSION
+}
+
+/**
+ * Integer-type tests: constant and counting iterators for each base value,
+ * followed by the tests shared with non-integer types
+ */
+template <typename T, typename CastT>
+void Test(Int2Type<true> is_integer)
+{
+    const int BASES[]   = {0, 99};
+    const int NUM_BASES = sizeof(BASES) / sizeof(BASES[0]);
+
+    for (int b = 0; b < NUM_BASES; ++b) TestConstant<T>(T(BASES[b]));
+    for (int b = 0; b < NUM_BASES; ++b) TestCounting<T>(T(BASES[b]));
+
+    Test<T, CastT>(Int2Type<false>());
+}
+
+/**
+ * Run the tests for T twice: once with a non-const and once with a const-qualified cast type
+ */
+template <typename T>
+void Test()
+{
+    // Signed and unsigned integer categories select the integer overload
+    enum { IS_INTEGER = (Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER) };
+    typedef Int2Type<IS_INTEGER> IsIntegerTag;
+
+    // Test non-const type
+    Test<T, T>(IsIntegerTag());
+
+    // Test const type
+    Test<T, const T>(IsIntegerTag());
+}
+
+
+/**
+ * Main: run the iterator tests over scalar, vector and test-struct types
+ */
+int main(int argc, char** argv)
+{
+    // Parse command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Don't check doubles on PTX120 or below because they're down-converted
+    int ptx_version;
+    CubDebugExit(PtxVersion(ptx_version));
+    const bool test_doubles = (ptx_version > 120);
+
+    // Scalar types
+    Test<char>();
+    Test<short>();
+    Test<int>();
+    Test<long>();
+    Test<long long>();
+    Test<float>();
+    if (test_doubles) Test<double>();
+
+    // Two-component vector types
+    Test<char2>();
+    Test<short2>();
+    Test<int2>();
+    Test<long2>();
+    Test<longlong2>();
+    Test<float2>();
+    if (test_doubles) Test<double2>();
+
+    // Three-component vector types
+    Test<char3>();
+    Test<short3>();
+    Test<int3>();
+    Test<long3>();
+    Test<longlong3>();
+    Test<float3>();
+    if (test_doubles) Test<double3>();
+
+    // Four-component vector types
+    Test<char4>();
+    Test<short4>();
+    Test<int4>();
+    Test<long4>();
+    Test<longlong4>();
+    Test<float4>();
+    if (test_doubles) Test<double4>();
+
+    Test<TestFoo>();
+    Test<TestBar>();
+
+    printf("\nTest complete\n"); fflush(stdout);
+
+    return 0;
+}
+
+
+
diff --git a/external/cub/test/test_util.h b/external/cub/test/test_util.h
new file mode 100644
index 00000000000..621726214e2
--- /dev/null
+++ b/external/cub/test/test_util.h
@@ -0,0 +1,1600 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+
+#pragma once
+
+#if defined(_WIN32) || defined(_WIN64)
+    #include <windows.h>
+    #undef small            // Windows is terrible for polluting macro namespace
+#else
+    #include <sys/resource.h>
+#endif
+
+#include <cuda_runtime.h>
+
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+
+#include <string>
+#include <vector>
+#include <sstream>
+#include <iostream>
+#include <limits>
+
+#include "mersenne.h"
+
+#include "cub/util_debug.cuh"
+#include "cub/util_device.cuh"
+#include "cub/util_type.cuh"
+#include "cub/util_macro.cuh"
+
+/******************************************************************************
+ * Assertion macros
+ ******************************************************************************/
+
+/**
+ * Assert equals: when (a) != (b), writes "\n(<file>: <line>)\n" to std::cerr and calls exit(1).  Expands to a bare `if` statement.
+ */
+#define AssertEquals(a, b) if ((a) != (b)) { std::cerr << "\n(" << __FILE__ << ": " << __LINE__ << ")\n"; exit(1);}
+
+
+/******************************************************************************
+ * Command-line parsing functionality
+ ******************************************************************************/
+
+/**
+ * Utility for parsing command line arguments: "--<key>" flags, "--<key>=<value>" pairs,
+ * and naked parameters (anything not starting with "--").  Also initializes the CUDA device.
+ */
+struct CommandLineArgs
+{
+    std::vector<std::string>    keys;                   // Names of the parsed "--<key>" arguments
+    std::vector<std::string>    values;                 // Value of each key ("" when no "=<value>" was given)
+    std::vector<std::string>    args;                   // Naked parameters, in commandline order
+    cudaDeviceProp              deviceProp;             // Set by DeviceInit()
+    float                       device_giga_bandwidth;  // Set by DeviceInit()
+    size_t                      device_free_physmem;    // Set by DeviceInit()
+    size_t                      device_total_physmem;   // Set by DeviceInit()
+
+    /**
+     * Constructor: seeds the mersenne generator, then splits argv[1..argc) into key/value pairs and
+     * naked parameters.  keys/values start out empty, so ParsedArgc() counts only parsed pairs.
+     */
+    CommandLineArgs(int argc, char **argv) :
+        deviceProp(),
+        device_giga_bandwidth(0),
+        device_free_physmem(0),
+        device_total_physmem(0)
+    {
+        // Initialize mersenne generator
+        unsigned int mersenne_init[4]=  {0x123, 0x234, 0x345, 0x456};
+        mersenne::init_by_array(mersenne_init, 4);
+
+        for (int i = 1; i < argc; i++)
+        {
+            const std::string arg(argv[i]);
+
+            // Anything that does not start with "--" is a naked parameter
+            if (arg.compare(0, 2, "--") != 0)
+            {
+                args.push_back(arg);
+                continue;
+            }
+
+            // Split "--<key>[=<value>]" at the first '='
+            const std::string::size_type eq = arg.find('=');
+            if (eq == std::string::npos)
+            {
+                keys.push_back(arg.substr(2));
+                values.push_back(std::string());
+            }
+            else
+            {
+                keys.push_back(arg.substr(2, eq - 2));
+                values.push_back(arg.substr(eq + 1));
+            }
+        }
+    }
+
+    /**
+     * Checks whether a flag "--<flag>" is present in the commandline
+     */
+    bool CheckCmdLineFlag(const char* arg_name)
+    {
+        const std::string name(arg_name);
+        for (size_t i = 0; i < keys.size(); ++i)
+        {
+            if (keys[i] == name)
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     * Returns number of naked (non-flag and non-key-value) commandline parameters.
+     * The template parameter is unused; it is kept so existing NumNakedArgs<T>() calls still compile.
+     */
+    template <typename T>
+    int NumNakedArgs()
+    {
+        return (int) args.size();
+    }
+
+    /**
+     * Returns the commandline parameter for a given index (not including flags).
+     * \p val is left untouched when \p index is out of range.
+     */
+    template <typename T>
+    void GetCmdLineArgument(int index, T &val)
+    {
+        if ((index >= 0) && (size_t(index) < args.size()))
+        {
+            std::istringstream str_stream(args[index]);
+            str_stream >> val;
+        }
+    }
+
+    /**
+     * Returns the value specified for a given commandline parameter --<flag>=<value>.
+     * When the flag is repeated, each occurrence is extracted into \p val in commandline order.
+     */
+    template <typename T>
+    void GetCmdLineArgument(const char *arg_name, T &val)
+    {
+        const std::string name(arg_name);
+        for (size_t i = 0; i < keys.size(); ++i)
+        {
+            if (keys[i] == name)
+            {
+                std::istringstream str_stream(values[i]);
+                str_stream >> val;
+            }
+        }
+    }
+
+    /**
+     * Returns the values specified for a given commandline parameter --<flag>=<value>,<value>*.
+     * \p vals is only modified when the flag is present.  Nothing is appended for empty fields
+     * (leading, doubled, or trailing commas, or no value at all) or for a field that fails to parse.
+     */
+    template <typename T>
+    void GetCmdLineArguments(const char *arg_name, std::vector<T> &vals)
+    {
+        if (!CheckCmdLineFlag(arg_name))
+            return;
+
+        // Clear any default values
+        vals.clear();
+
+        const std::string name(arg_name);
+        for (size_t i = 0; i < keys.size(); ++i)
+        {
+            if (keys[i] != name)
+                continue;
+
+            // Recover from multi-value string
+            const std::string &val_string = values[i];
+            std::istringstream str_stream(val_string);
+            std::string::size_type old_pos = 0;
+            std::string::size_type new_pos = 0;
+
+            // Iterate comma-separated values
+            T val;
+            while ((new_pos = val_string.find(',', old_pos)) != std::string::npos)
+            {
+                if (new_pos != old_pos)
+                {
+                    str_stream.width(new_pos - old_pos);    // bounds string extraction to this field
+                    if (str_stream >> val)
+                        vals.push_back(val);
+                }
+
+                // skip over comma
+                str_stream.ignore(1);
+                old_pos = new_pos + 1;
+            }
+
+            // Read last value; pushing only on success avoids appending a stale or uninitialized entry
+            if (str_stream >> val)
+                vals.push_back(val);
+        }
+    }
+
+    /**
+     * The number of pairs parsed
+     */
+    int ParsedArgc()
+    {
+        return (int) keys.size();
+    }
+
+    /**
+     * Initialize device.  A negative \p dev is replaced by the "--device=<id>" value; an id
+     * that is still negative or not below the device count falls back to device 0.  A one-line
+     * device summary is printed unless "--quiet" is present.
+     */
+    cudaError_t DeviceInit(int dev = -1)
+    {
+        cudaError_t error = cudaSuccess;
+
+        do
+        {
+            int deviceCount;
+            error = CubDebug(cudaGetDeviceCount(&deviceCount));
+            if (error) break;
+
+            if (deviceCount == 0) {
+                fprintf(stderr, "No devices supporting CUDA.\n");
+                exit(1);
+            }
+            if (dev < 0)
+            {
+                GetCmdLineArgument("device", dev);
+            }
+            if ((dev > deviceCount - 1) || (dev < 0))
+            {
+                dev = 0;
+            }
+
+            error = CubDebug(cudaSetDevice(dev));
+            if (error) break;
+
+            CubDebugExit(cudaMemGetInfo(&device_free_physmem, &device_total_physmem));
+
+            int ptx_version;
+            error = CubDebug(cub::PtxVersion(ptx_version));
+            if (error) break;
+
+            error = CubDebug(cudaGetDeviceProperties(&deviceProp, dev));
+            if (error) break;
+
+            if (deviceProp.major < 1) {
+                fprintf(stderr, "Device does not support CUDA.\n");
+                exit(1);
+            }
+
+            device_giga_bandwidth = float(deviceProp.memoryBusWidth) * deviceProp.memoryClockRate * 2 / 8 / 1000 / 1000;
+
+            if (!CheckCmdLineFlag("quiet"))
+            {
+                printf(
+                        "Using device %d: %s (PTX version %d, SM%d, %d SMs, "
+                        "%llu free / %llu total MB physmem, "
+                        "%.3f GB/s @ %d kHz mem clock, ECC %s)\n",
+                    dev,
+                    deviceProp.name,
+                    ptx_version,
+                    deviceProp.major * 100 + deviceProp.minor * 10,
+                    deviceProp.multiProcessorCount,
+                    (unsigned long long) device_free_physmem / 1024 / 1024,
+                    (unsigned long long) device_total_physmem / 1024 / 1024,
+                    device_giga_bandwidth,
+                    deviceProp.memoryClockRate,
+                    (deviceProp.ECCEnabled) ? "on" : "off");
+                fflush(stdout);
+            }
+        } while (0);
+
+        return error;
+    }
+};
+
+/******************************************************************************
+ * Random bits generator
+ ******************************************************************************/
+
+// Running count of the mersenne::genrand_int32() samples drawn by RandomBits()
+int g_num_rand_samples = 0;
+
+/**
+ * Returns true when \p val is a NaN, or, for the CUDA float/double vector types, when any
+ * component is a NaN.  The generic overload reports false for every type that has no
+ * specialization below.  The scalar specializations inspect the raw bit pattern, which is
+ * copied out with memcpy so that no object is read through a pointer of an unrelated type.
+ */
+template <typename T>
+bool IsNaN(T val) { return false; }
+
+/// float: a NaN has an all-ones exponent and a nonzero mantissa (the sign bit is irrelevant)
+template<>
+__noinline__ bool IsNaN<float>(float val)
+{
+    unsigned int bits;
+    memcpy(&bits, &val, sizeof(bits));
+    return ((bits & 0x7FFFFFFFu) > 0x7F800000u);
+}
+
+/// float vectors: NaN when any component is NaN
+template<> __noinline__ bool IsNaN<float1>(float1 val)
+{
+    return IsNaN(val.x);
+}
+
+template<> __noinline__ bool IsNaN<float2>(float2 val)
+{
+    return (IsNaN(val.x) || IsNaN(val.y));
+}
+
+template<> __noinline__ bool IsNaN<float3>(float3 val)
+{
+    return (IsNaN(val.x) || IsNaN(val.y) || IsNaN(val.z));
+}
+
+template<> __noinline__ bool IsNaN<float4>(float4 val)
+{
+    return (IsNaN(val.x) || IsNaN(val.y) || IsNaN(val.z) || IsNaN(val.w));
+}
+
+/// double: same test on the 64-bit pattern
+template<>
+__noinline__ bool IsNaN<double>(double val)
+{
+    unsigned long long bits;
+    memcpy(&bits, &val, sizeof(bits));
+    return ((bits & 0x7FFFFFFFFFFFFFFFULL) > 0x7FF0000000000000ULL);
+}
+
+/// double vectors: NaN when any component is NaN
+template<> __noinline__ bool IsNaN<double1>(double1 val)
+{
+    return IsNaN(val.x);
+}
+
+template<> __noinline__ bool IsNaN<double2>(double2 val)
+{
+    return (IsNaN(val.x) || IsNaN(val.y));
+}
+
+template<> __noinline__ bool IsNaN<double3>(double3 val)
+{
+    return (IsNaN(val.x) || IsNaN(val.y) || IsNaN(val.z));
+}
+
+template<> __noinline__ bool IsNaN<double4>(double4 val)
+{
+    return (IsNaN(val.x) || IsNaN(val.y) || IsNaN(val.z) || IsNaN(val.w));
+}
+
+
+/**
+ * Generates random keys.
+ *
+ * Each 32-bit word of the key is drawn from mersenne::genrand_int32().  Bits outside
+ * [begin_bit, end_bit) are cleared (a negative end_bit selects all bits of K), and bit
+ * patterns that IsNaN() flags are redrawn.
+ *
+ * We can decrease the entropy level of keys by adopting the technique
+ * of Thearling and Smith in which keys are computed from the bitwise AND of
+ * multiple random samples:
+ *
+ * entropy_reduction    | Effectively-unique bits per key
+ * -----------------------------------------------------
+ * -1                   | 0
+ * 0                    | 32
+ * 1                    | 25.95 (81%)
+ * 2                    | 17.41 (54%)
+ * 3                    | 10.78 (34%)
+ * 4                    | 6.42 (20%)
+ * ...                  | ...
+ *
+ */
+template <typename K>
+void RandomBits(
+    K &key,
+    int entropy_reduction = 0,
+    int begin_bit = 0,
+    int end_bit = sizeof(K) * 8)
+{
+    const int WORD_BYTES = sizeof(unsigned int);
+    const int WORD_BITS = WORD_BYTES * 8;
+    const int NUM_WORDS = (int(sizeof(K)) + WORD_BYTES - 1) / WORD_BYTES;
+
+    unsigned int word_buff[NUM_WORDS];
+
+    if (entropy_reduction == -1)
+    {
+        memset((void *) &key, 0, sizeof(key));
+        return;
+    }
+
+    if (end_bit < 0)
+        end_bit = sizeof(K) * 8;
+
+    while (true)
+    {
+        for (int j = 0; j < NUM_WORDS; j++)
+        {
+            // Bits to clear at the low / high end of this word.  Shifting by WORD_BITS or more is
+            // undefined, so a word lying entirely outside [begin_bit, end_bit) is zeroed explicitly.
+            const int low_skip  = begin_bit - (j * WORD_BITS);
+            const int high_skip = ((j + 1) * WORD_BITS) - end_bit;
+            unsigned int word = 0xffffffff;
+            if (low_skip > 0)
+                word &= (low_skip < WORD_BITS) ? (0xffffffff << low_skip) : 0;
+            if (high_skip > 0)
+                word &= (high_skip < WORD_BITS) ? (0xffffffff >> high_skip) : 0;
+
+            // Samples are drawn even for a fully-masked word so the generator sequence does not depend on the mask
+            for (int i = 0; i <= entropy_reduction; i++)
+            {
+                word &= mersenne::genrand_int32();
+                g_num_rand_samples++;
+            }
+            word_buff[j] = word;
+        }
+
+        memcpy(&key, word_buff, sizeof(K));
+        if (!IsNaN(key))
+            break;          // avoids NaNs when generating random floating point numbers
+    }
+}
+
+/// Randomly selects a value in the half-open interval [0, max)
+template <typename T>
+T RandomValue(T max)
+{
+    const unsigned int all_ones = (unsigned int) -1;
+    unsigned int sample;
+    // Redraw on the all-ones sample so that the scaling fraction stays strictly below 1
+    do { RandomBits(sample); } while (sample == all_ones);
+
+    const double fraction = double(sample) / double(all_ones);
+    return (T) (fraction * double(max));
+}
+
+
+/******************************************************************************
+ * Console printing utilities
+ ******************************************************************************/
+
+/**
+ * Pass-through used when printing with cout; the character overloads widen to int so they print as numbers
+ */
+template <typename T>
+T CoutCast(T val) { return val; }
+
+int CoutCast(char val) { return static_cast<int>(val); }
+
+int CoutCast(unsigned char val) { return static_cast<int>(val); }
+
+int CoutCast(signed char val) { return static_cast<int>(val); }
+
+
+
+/******************************************************************************
+ * Test value initialization utilities
+ ******************************************************************************/
+
+/**
+ * Test problem generation options
+ */
+enum GenMode
+{
+    UNIFORM         = 0,    // Every value is assigned '2' (true for bool), regardless of integer seed
+    INTEGER_SEED    = 1,    // Every value is assigned from its integer seed
+    RANDOM          = 2,    // Every value is assigned random bits, regardless of integer seed
+};
+
+/**
+ * Initialize value according to \p gen_mode:
+ *   - RANDOM (only compiled in when CUB_PTX_ARCH == 0):   random bits
+ *   - UNIFORM:                                            the value 2
+ *   - anything else:                                      \p index converted to T
+ */
+template <typename T>
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)
+{
+#if (CUB_PTX_ARCH == 0)
+    if (gen_mode == RANDOM)
+    {
+        RandomBits(value);
+        return;
+    }
+#endif
+    // Without the branch above, RANDOM falls through to the index-based assignment
+    if (gen_mode == UNIFORM)
+        value = 2;
+    else
+        value = (T) index;
+}
+
+
+/**
+ * Initialize value (bool): RANDOM (only compiled in when CUB_PTX_ARCH == 0) draws one random bit,
+ * UNIFORM yields true, and every other mode yields (index > 0)
+ */
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, int index = 0)
+{
+#if (CUB_PTX_ARCH == 0)
+    if (gen_mode == RANDOM)
+    {
+        // Randomize only bit 0 of a char, then test whether it came out set
+        char bit;
+        RandomBits(bit, 0, 0, 1);
+        value = (bit > 0);
+        return;
+    }
+#endif
+    if (gen_mode == UNIFORM)
+    {
+        value = true;
+        return;
+    }
+    value = (index > 0);
+}
+
+
+/// cub::NullType test initialization: deliberately a no-op
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, cub::NullType &value, int index = 0)
+{
+    (void) gen_mode; (void) value; (void) index;    // nothing is initialized; all arguments are ignored
+}
+
+
+/**
+ * cub::KeyValuePair<KeyT, ValueT> test initialization: the value follows \p gen_mode, the key is a random 0/1 flag
+ */
+template <typename KeyT, typename ValueT>
+__host__ __device__ __forceinline__ void InitValue(
+    GenMode                             gen_mode,
+    cub::KeyValuePair<KeyT, ValueT>&    value,
+    int                                 index = 0)
+{
+    const int FLAG_ENTROPY_REDUCTION = 3;
+
+    InitValue(gen_mode, value.value, index);
+    RandomBits(value.key, FLAG_ENTROPY_REDUCTION);      // random key bits at entropy-reduction level 3
+    value.key = (value.key & 0x1);                      // only the last bit is kept as the flag
+}
+
+
+
+/******************************************************************************
+ * Comparison and ostream operators
+ ******************************************************************************/
+
+/// KeyValuePair ostream operator: writes "(<key>,<value>)", passing both members through CoutCast
+template <typename Key, typename Value>
+std::ostream& operator<<(std::ostream& os, const cub::KeyValuePair<Key, Value> &val)
+{
+    os << '(';
+    os << CoutCast(val.key);
+    os << ',' << CoutCast(val.value) << ')';
+    return os;
+}
+
+
+/******************************************************************************
+ * Comparison and ostream operators for CUDA vector types
+ ******************************************************************************/
+
+/**
+ * Vector1 overloads: for a 1-component vector type T with component type BaseT, defines ostream output, !=, ==, InitValue, >, <, + and a cub::NumericTraits<T> specialization
+ */
+#define CUB_VEC_OVERLOAD_1(T, BaseT)                        \
+    /* Ostream output */                                    \
+    std::ostream& operator<<(                               \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '(' << CoutCast(val.x) << ')';                \
+        return os;                                          \
+    }                                                       \
+    /* Inequality */                                        \
+    __host__ __device__ __forceinline__ bool operator!=(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x);                                \
+    }                                                       \
+    /* Equality */                                          \
+    __host__ __device__ __forceinline__ bool operator==(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x);                                \
+    }                                                       \
+    /* Test initialization */                               \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+    }                                                       \
+    /* Greater-than */                                      \
+    __host__ __device__ __forceinline__ bool operator>(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x > b.x);                                 \
+    }                                                       \
+    /* Less-than */                                         \
+    __host__ __device__ __forceinline__ bool operator<(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x < b.x);                                 \
+    }                                                       \
+    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround) */                      \
+    __host__ __device__ __forceinline__ T operator+(        \
+        T a,                                                \
+        T b)                                                \
+    {                                                       \
+        T retval = make_##T(a.x + b.x);                     \
+        return retval;                                      \
+    }                                                       \
+    namespace cub {                                         \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static T Max()                                      \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static T Lowest()                                   \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    } /* namespace cub */
+
+
+
+/**
+ * Vector2 overloads: for a 2-component vector type T with component type BaseT, defines ostream output, !=, ==, InitValue, >, <, + and a cub::NumericTraits<T> specialization
+ */
+#define CUB_VEC_OVERLOAD_2(T, BaseT)                        \
+    /* Ostream output */                                    \
+    std::ostream& operator<<(                               \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '('                                           \
+            << CoutCast(val.x) << ','                       \
+            << CoutCast(val.y) << ')';                      \
+        return os;                                          \
+    }                                                       \
+    /* Inequality */                                        \
+    __host__ __device__ __forceinline__ bool operator!=(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x) ||                              \
+            (a.y != b.y);                                   \
+    }                                                       \
+    /* Equality */                                          \
+    __host__ __device__ __forceinline__ bool operator==(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x) &&                              \
+            (a.y == b.y);                                   \
+    }                                                       \
+    /* Test initialization */                               \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+        InitValue(gen_mode, value.y, index);                \
+    }                                                       \
+    /* Greater-than (lexicographic: x, then y) */           \
+    __host__ __device__ __forceinline__ bool operator>(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
+        return a.y > b.y;                                               \
+    }                                                       \
+    /* Less-than (lexicographic: x, then y) */              \
+    __host__ __device__ __forceinline__ bool operator<(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
+        return a.y < b.y;                                               \
+    }                                                       \
+    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround) */                                        \
+    __host__ __device__ __forceinline__ T operator+(        \
+        T a,                                         \
+        T b)                                         \
+    {                                                       \
+        T retval = make_##T(                                        \
+            a.x + b.x,                                      \
+            a.y + b.y);                                     \
+        return retval;                                      \
+    }                                                       \
+    namespace cub {                                         \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static T Max()                                      \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static T Lowest()                                   \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    } /* namespace cub */
+
+
+
+/**
+ * Vector3 overloads: for a 3-component vector type T with component type BaseT, defines ostream output, !=, ==, InitValue, >, <, + and a cub::NumericTraits<T> specialization
+ */
+#define CUB_VEC_OVERLOAD_3(T, BaseT)                        \
+    /* Ostream output */                                    \
+    std::ostream& operator<<(                               \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '('                                           \
+            << CoutCast(val.x) << ','                       \
+            << CoutCast(val.y) << ','                       \
+            << CoutCast(val.z) << ')';                      \
+        return os;                                          \
+    }                                                       \
+    /* Inequality */                                        \
+    __host__ __device__ __forceinline__ bool operator!=(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x) ||                              \
+            (a.y != b.y) ||                                 \
+            (a.z != b.z);                                   \
+    }                                                       \
+    /* Equality */                                          \
+    __host__ __device__ __forceinline__ bool operator==(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x) &&                              \
+            (a.y == b.y) &&                                 \
+            (a.z == b.z);                                   \
+    }                                                       \
+    /* Test initialization */                               \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+        InitValue(gen_mode, value.y, index);                \
+        InitValue(gen_mode, value.z, index);                \
+    }                                                       \
+    /* Greater-than (lexicographic: x, then y, then z) */   \
+    __host__ __device__ __forceinline__ bool operator>(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
+        if (a.y > b.y) return true; else if (b.y > a.y) return false;   \
+        return a.z > b.z;                                               \
+    }                                                       \
+    /* Less-than (lexicographic: x, then y, then z) */      \
+    __host__ __device__ __forceinline__ bool operator<(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
+        if (a.y < b.y) return true; else if (b.y < a.y) return false;   \
+        return a.z < b.z;                                               \
+    }                                                       \
+    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround) */                                        \
+    __host__ __device__ __forceinline__ T operator+(        \
+        T a,                                                \
+        T b)                                                \
+    {                                                       \
+        T retval = make_##T(                                        \
+            a.x + b.x,                                      \
+            a.y + b.y,                                      \
+            a.z + b.z);                                     \
+        return retval;                                      \
+    }                                                       \
+    namespace cub {                                         \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static T Max()                                      \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static T Lowest()                                   \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    } /* namespace cub */
+
+
+/**
+ * Vector4 overloads (stream output, !=, ==, InitValue, >, <, + and a cub::NumericTraits specialization) for a 4-component vector type T whose components have type BaseT
+ */
+#define CUB_VEC_OVERLOAD_4(T, BaseT)                        \
+    /* Ostream output: writes (x,y,z,w), each component passed through CoutCast */ \
+    std::ostream& operator<<(                               \
+        std::ostream& os,                                   \
+        const T& val)                                       \
+    {                                                       \
+        os << '('                                           \
+            << CoutCast(val.x) << ','                       \
+            << CoutCast(val.y) << ','                       \
+            << CoutCast(val.z) << ','                       \
+            << CoutCast(val.w) << ')';                      \
+        return os;                                          \
+    }                                                       \
+    /* Inequality: true if any component differs */         \
+    __host__ __device__ __forceinline__ bool operator!=(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x != b.x) ||                              \
+            (a.y != b.y) ||                                 \
+            (a.z != b.z) ||                                 \
+            (a.w != b.w);                                   \
+    }                                                       \
+    /* Equality: true if every component matches */         \
+    __host__ __device__ __forceinline__ bool operator==(    \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        return (a.x == b.x) &&                              \
+            (a.y == b.y) &&                                 \
+            (a.z == b.z) &&                                 \
+            (a.w == b.w);                                   \
+    }                                                       \
+    /* Test initialization: initializes each component via InitValue with the same gen_mode and index */ \
+    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
+    {                                                       \
+        InitValue(gen_mode, value.x, index);                \
+        InitValue(gen_mode, value.y, index);                \
+        InitValue(gen_mode, value.z, index);                \
+        InitValue(gen_mode, value.w, index);                \
+    }                                                       \
+    /* Max: greater-than, lexicographic over x, y, z, then w */ \
+    __host__ __device__ __forceinline__ bool operator>(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
+        if (a.y > b.y) return true; else if (b.y > a.y) return false;   \
+        if (a.z > b.z) return true; else if (b.z > a.z) return false;   \
+        return a.w > b.w;                                               \
+    }                                                       \
+    /* Min: less-than, lexicographic over x, y, z, then w */ \
+    __host__ __device__ __forceinline__ bool operator<(     \
+        const T &a,                                         \
+        const T &b)                                         \
+    {                                                       \
+        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
+        if (a.y < b.y) return true; else if (b.y < a.y) return false;   \
+        if (a.z < b.z) return true; else if (b.z < a.z) return false;   \
+        return a.w < b.w;                                               \
+    }                                                       \
+    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround) */                                        \
+    __host__ __device__ __forceinline__ T operator+(        \
+        T a,                                                \
+        T b)                                                \
+    {                                                       \
+        T retval = make_##T(                                        \
+            a.x + b.x,                                      \
+            a.y + b.y,                                      \
+            a.z + b.z,                                      \
+            a.w + b.w);                                     \
+        return retval;                                      \
+    }                                                       \
+    namespace cub { /* Traits for T: not primitive; Max()/Lowest() set every component to the BaseT limit */ \
+    template<>                                              \
+    struct NumericTraits<T>                                 \
+    {                                                       \
+        static const Category CATEGORY = NOT_A_NUMBER;      \
+        enum {                                              \
+            PRIMITIVE       = false,                        \
+            NULL_TYPE       = false,                        \
+        };                                                  \
+        static T Max()                                      \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max(),                \
+                NumericTraits<BaseT>::Max()};               \
+            return retval;                                  \
+        }                                                   \
+        static T Lowest()                                   \
+        {                                                   \
+            T retval = {                                    \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest(),             \
+                NumericTraits<BaseT>::Lowest()};            \
+            return retval;                                  \
+        }                                                   \
+    };                                                      \
+    } /* namespace cub */
+
+/**
+ * All vector overloads: expands the 1-, 2-, 3- and 4-component overload macros for COMPONENT_T##1 .. COMPONENT_T##4, all sharing component type BaseT
+ */
+#define CUB_VEC_OVERLOAD(COMPONENT_T, BaseT)                    \
+    CUB_VEC_OVERLOAD_1(COMPONENT_T##1, BaseT)                   \
+    CUB_VEC_OVERLOAD_2(COMPONENT_T##2, BaseT)                   \
+    CUB_VEC_OVERLOAD_3(COMPONENT_T##3, BaseT)                   \
+    CUB_VEC_OVERLOAD_4(COMPONENT_T##4, BaseT)
+
+/**
+ * Define for types: first argument is the vector type name prefix (e.g. char yields char1..char4), second is the component type
+ */
+CUB_VEC_OVERLOAD(char, char)
+CUB_VEC_OVERLOAD(short, short)
+CUB_VEC_OVERLOAD(int, int)
+CUB_VEC_OVERLOAD(long, long)
+CUB_VEC_OVERLOAD(longlong, long long)
+CUB_VEC_OVERLOAD(uchar, unsigned char)
+CUB_VEC_OVERLOAD(ushort, unsigned short)
+CUB_VEC_OVERLOAD(uint, unsigned int)
+CUB_VEC_OVERLOAD(ulong, unsigned long)
+CUB_VEC_OVERLOAD(ulonglong, unsigned long long)
+CUB_VEC_OVERLOAD(float, float)
+CUB_VEC_OVERLOAD(double, double)
+
+
+//---------------------------------------------------------------------
+// Complex data type TestFoo
+//---------------------------------------------------------------------
+
+/**
+ * TestFoo: aggregate test type with four integer members of different widths
+ */
+struct TestFoo
+{
+    long long   x;
+    int         y;
+    short       z;
+    char        w;
+
+    // Named constructor (TestFoo has no user-declared constructor and is brace-initialized)
+    static __host__ __device__ __forceinline__ TestFoo MakeTestFoo(long long x, int y, short z, char w)
+    {
+        TestFoo made = {x, y, z, w};
+        return made;
+    }
+
+    // Broadcasts an int into every member
+    __host__ __device__ __forceinline__ TestFoo& operator =(int b)
+    {
+        w = b;
+        z = b;
+        y = b;
+        x = b;
+        return *this;
+    }
+
+    // Member-wise sum
+    __host__ __device__ __forceinline__ TestFoo operator+(const TestFoo &b) const
+    {
+        return MakeTestFoo(b.x + x, b.y + y, b.z + z, b.w + w);
+    }
+
+    // Inequality: at least one member differs
+    __host__ __device__ __forceinline__ bool operator !=(const TestFoo &b) const
+    {
+        return !((x == b.x) && (y == b.y) && (z == b.z) && (w == b.w));
+    }
+
+    // Equality: no member differs
+    __host__ __device__ __forceinline__ bool operator ==(const TestFoo &b) const
+    {
+        return !((x != b.x) || (y != b.y) || (z != b.z) || (w != b.w));
+    }
+
+    // Less than: lexicographic over x, y, z, w (the first differing member decides)
+    __host__ __device__ __forceinline__ bool operator <(const TestFoo &b) const
+    {
+        if (x != b.x) return x < b.x;
+        if (y != b.y) return y < b.y;
+        if (z != b.z) return z < b.z;
+        return w < b.w;
+    }
+
+    // Greater than: lexicographic over x, y, z, w (the first differing member decides)
+    __host__ __device__ __forceinline__ bool operator >(const TestFoo &b) const
+    {
+        if (x != b.x) return x > b.x;
+        if (y != b.y) return y > b.y;
+        if (z != b.z) return z > b.z;
+        return w > b.w;
+    }
+
+};
+
+/**
+ * TestFoo ostream operator: writes (x,y,z,w), with w passed through CoutCast. Inline so that including this definition from several translation units does not yield duplicate symbols.
+ */
+inline std::ostream& operator<<(std::ostream& os, const TestFoo& val)
+{
+    os << '(' << val.x << ',' << val.y << ',' << val.z << ',' << CoutCast(val.w) << ')';
+    return os;
+}
+
+/**
+ * TestFoo test initialization: initializes each member through the InitValue overload for its type, using the same gen_mode and index
+ */
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestFoo &value, int index = 0)
+{
+    InitValue(gen_mode, value.x, index);
+    InitValue(gen_mode, value.y, index);
+    InitValue(gen_mode, value.z, index);
+    InitValue(gen_mode, value.w, index);
+}
+
+
+/// cub::NumericTraits<TestFoo> specialization: not primitive; Max()/Lowest() combine the limits of long long, int, short and char
+namespace cub {
+template<>
+struct NumericTraits<TestFoo>
+{
+    static const Category CATEGORY = NOT_A_NUMBER;
+    enum {
+        PRIMITIVE       = false,
+        NULL_TYPE       = false,
+    };
+    static TestFoo Max()
+    {
+        return TestFoo::MakeTestFoo(
+            NumericTraits<long long>::Max(),
+            NumericTraits<int>::Max(),
+            NumericTraits<short>::Max(),
+            NumericTraits<char>::Max());
+    }
+
+    static TestFoo Lowest()
+    {
+        return TestFoo::MakeTestFoo(
+            NumericTraits<long long>::Lowest(),
+            NumericTraits<int>::Lowest(),
+            NumericTraits<short>::Lowest(),
+            NumericTraits<char>::Lowest());
+    }
+};
+} // namespace cub
+
+
+//---------------------------------------------------------------------
+// Complex data type TestBar (with optimizations for fence-free warp-synchrony)
+//---------------------------------------------------------------------
+
+/**
+ * TestBar: two-member (long long, int) test type with user-provided constructors
+ */
+struct TestBar
+{
+    long long       x;
+    int             y;
+
+    // Default constructor: both members zero
+    __host__ __device__ __forceinline__ TestBar() : x(0), y(0)
+    {}
+
+    // Broadcast constructor: both members take the value b
+    __host__ __device__ __forceinline__ TestBar(int b) : x(b), y(b)
+    {}
+
+    // Member-wise constructor
+    __host__ __device__ __forceinline__ TestBar(long long x, int y) : x(x), y(y)
+    {}
+
+    // Broadcasts an int into both members
+    __host__ __device__ __forceinline__ TestBar& operator =(int b)
+    {
+        y = b;
+        x = b;
+        return *this;
+    }
+
+    // Member-wise sum
+    __host__ __device__ __forceinline__ TestBar operator+(const TestBar &b) const
+    {
+        return TestBar(b.x + x, b.y + y);
+    }
+
+    // Inequality: at least one member differs
+    __host__ __device__ __forceinline__ bool operator !=(const TestBar &b) const
+    {
+        return !((x == b.x) && (y == b.y));
+    }
+
+    // Equality: no member differs
+    __host__ __device__ __forceinline__ bool operator ==(const TestBar &b) const
+    {
+        return !((x != b.x) || (y != b.y));
+    }
+
+    // Less than: x decides unless equal, then y
+    __host__ __device__ __forceinline__ bool operator <(const TestBar &b) const
+    {
+        if (x != b.x) return x < b.x;
+        return y < b.y;
+    }
+
+    // Greater than: x decides unless equal, then y
+    __host__ __device__ __forceinline__ bool operator >(const TestBar &b) const
+    {
+        if (x != b.x) return x > b.x;
+        return y > b.y;
+    }
+
+};
+
+
+
+/**
+ * TestBar ostream operator: writes (x,y). Inline so that including this definition from several translation units does not yield duplicate symbols.
+ */
+inline std::ostream& operator<<(std::ostream& os, const TestBar& val)
+{
+    os << '(' << val.x << ',' << val.y << ')';
+    return os;
+}
+
+/**
+ * TestBar test initialization: initializes x and y through the InitValue overloads for their types, using the same gen_mode and index
+ */
+__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestBar &value, int index = 0)
+{
+    InitValue(gen_mode, value.x, index);
+    InitValue(gen_mode, value.y, index);
+}
+
+/// cub::NumericTraits<TestBar> specialization: not primitive; Max()/Lowest() combine the limits of long long and int
+namespace cub {
+template<>
+struct NumericTraits<TestBar>
+{
+    static const Category CATEGORY = NOT_A_NUMBER;
+    enum {
+        PRIMITIVE       = false,
+        NULL_TYPE       = false,
+    };
+    static TestBar Max()
+    {
+        return TestBar(
+            NumericTraits<long long>::Max(),
+            NumericTraits<int>::Max());
+    }
+
+    static TestBar Lowest()
+    {
+        return TestBar(
+            NumericTraits<long long>::Lowest(),
+            NumericTraits<int>::Lowest());
+    }
+};
+} // namespace cub
+
+
+/******************************************************************************
+ * Helper routines for list comparison and display
+ ******************************************************************************/
+
+
+/**
+ * Element-wise comparison of two arrays using operator!=; returns 1 at the first mismatch (optionally printing it), 0 otherwise
+ */
+template <typename S, typename T, typename OffsetT>
+int CompareResults(T* computed, S* reference, OffsetT len, bool verbose = true)
+{
+    for (OffsetT idx = 0; idx < len; idx++)
+    {
+        if (!(computed[idx] != reference[idx])) continue;
+        // First mismatch: report it if requested, then stop scanning
+        if (verbose)
+        {
+            std::cout << "INCORRECT: [" << idx << "]: " << CoutCast(computed[idx]) << " != " << CoutCast(reference[idx]);
+        }
+        return 1;
+    }
+    return 0;
+}
+
+
+/**
+ * Compares two float arrays; returns 1 at the first element whose relative error exceeds 0.0001 (optionally printing it), else 0. A NaN matches only another NaN.
+ */
+template <typename OffsetT>
+int CompareResults(float* computed, float* reference, OffsetT len, bool verbose = true)
+{
+    for (OffsetT i = 0; i < len; i++)
+    {
+        if (computed[i] != reference[i])
+        {
+            float difference = std::abs(computed[i]-reference[i]);
+            float fraction = difference / std::abs(reference[i]);
+            bool both_nan = (computed[i] != computed[i]) && (reference[i] != reference[i]);   // x != x holds only for NaN
+            if (!both_nan && !(fraction <= 0.0001))     // negated <= so a NaN fraction is reported instead of silently passing
+            {
+                if (verbose) std::cout << "INCORRECT: [" << i << "]: "
+                    << "(computed) " << CoutCast(computed[i]) << " != "
+                    << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+
+
+/**
+ * Compares the equivalence of two arrays (cub::NullType overload: nothing is compared, always returns 0)
+ */
+template <typename OffsetT>
+int CompareResults(cub::NullType* computed, cub::NullType* reference, OffsetT len, bool verbose = true)
+{
+    return 0;
+}
+
+/**
+ * Compares two double arrays; returns 1 at the first element whose relative error exceeds 0.0001 (optionally printing it), else 0. A NaN matches only another NaN.
+ */
+template <typename OffsetT>
+int CompareResults(double* computed, double* reference, OffsetT len, bool verbose = true)
+{
+    for (OffsetT i = 0; i < len; i++)
+    {
+        if (computed[i] != reference[i])
+        {
+            double difference = std::abs(computed[i]-reference[i]);
+            double fraction = difference / std::abs(reference[i]);
+            bool both_nan = (computed[i] != computed[i]) && (reference[i] != reference[i]);   // x != x holds only for NaN
+            if (!both_nan && !(fraction <= 0.0001))     // negated <= so a NaN fraction is reported instead of silently passing
+            {
+                if (verbose) std::cout << "INCORRECT: [" << i << "]: "
+                    << CoutCast(computed[i]) << " != "
+                    << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
+                return 1;
+            }
+        }
+    }
+    return 0;
+}
+
+
+/**
+ * Verify the contents of a device array match those of a host array (cub::NullType overload: nothing is compared, always returns 0).
+ * Inline so that including this non-template definition from several translation units does not yield duplicate symbols.
+ */
+inline int CompareDeviceResults(
+    cub::NullType *h_reference,
+    cub::NullType *d_data,
+    size_t num_items,
+    bool verbose = true,
+    bool display_data = false)
+{
+    return 0;
+}
+
+
+/**
+ * Verify the contents of a device array match those of a host array.
+ * Returns 0 on a match; returns 1 on a mismatch or if the device data cannot be copied to the host.
+ */
+template <typename S, typename T>
+int CompareDeviceResults(
+    S *h_reference,
+    T *d_data,
+    size_t num_items,
+    bool verbose = true,
+    bool display_data = false)
+{
+    // Allocate array on host (a NULL result is only an error when there is something to copy)
+    T *h_data = (T*) malloc(num_items * sizeof(T));
+    bool ok = (num_items == 0) || (h_data != NULL);
+    // Copy data back; a failed copy would leave h_data uninitialized, so it is reported as a mismatch
+    ok = ok && ((num_items == 0) || (cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost) == cudaSuccess));
+    if (!ok && verbose) printf("CompareDeviceResults: could not copy device data to host\n");
+    // Display data (size_t index: casting num_items to int would truncate large counts)
+    if (ok && display_data)
+    {
+        printf("Reference:\n");
+        for (size_t i = 0; i < num_items; i++)
+        {
+            std::cout << CoutCast(h_reference[i]) << ", ";
+        }
+        printf("\n\nComputed:\n");
+        for (size_t i = 0; i < num_items; i++)
+        {
+            std::cout << CoutCast(h_data[i]) << ", ";
+        }
+        printf("\n\n");
+    }
+
+    // Check
+    int retval = ok ? CompareResults(h_data, h_reference, num_items, verbose) : 1;
+
+    // Cleanup
+    if (h_data) free(h_data);
+
+    return retval;
+}
+
+
+/**
+ * Verify the contents of a device array match those of another device array (both are copied to the host first).
+ * Returns 0 on a match; returns 1 on a mismatch or if either array cannot be copied to the host.
+ */
+template <typename T>
+int CompareDeviceDeviceResults(
+    T *d_reference,
+    T *d_data,
+    size_t num_items,
+    bool verbose = true,
+    bool display_data = false)
+{
+    // Allocate arrays on host (a NULL result is only an error when there is something to copy)
+    T *h_reference = (T*) malloc(num_items * sizeof(T));
+    T *h_data = (T*) malloc(num_items * sizeof(T));
+    bool ok = (num_items == 0) || ((h_reference != NULL) && (h_data != NULL));
+    // Copy data back; a failed copy would leave the host buffers uninitialized, so it is reported as a mismatch
+    ok = ok && ((num_items == 0) || (cudaMemcpy(h_reference, d_reference, sizeof(T) * num_items, cudaMemcpyDeviceToHost) == cudaSuccess));
+    ok = ok && ((num_items == 0) || (cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost) == cudaSuccess));
+    if (!ok && verbose) printf("CompareDeviceDeviceResults: could not copy device data to host\n");
+    // Display data (size_t index avoids comparing a signed int against size_t)
+    if (ok && display_data) {
+        printf("Reference:\n");
+        for (size_t i = 0; i < num_items; i++)
+        {
+            std::cout << CoutCast(h_reference[i]) << ", ";
+        }
+        printf("\n\nComputed:\n");
+        for (size_t i = 0; i < num_items; i++)
+        {
+            std::cout << CoutCast(h_data[i]) << ", ";
+        }
+        printf("\n\n");
+    }
+
+    // Check
+    int retval = ok ? CompareResults(h_data, h_reference, num_items, verbose) : 1;
+
+    // Cleanup
+    if (h_reference) free(h_reference);
+    if (h_data) free(h_data);
+
+    return retval;
+}
+
+
+/**
+ * Print the contents of a host array (cub::NullType overload: prints nothing). Inline so that including this non-template definition from several translation units does not yield duplicate symbols.
+ */
+inline void DisplayResults(
+    cub::NullType   *h_data,
+    size_t          num_items)
+{}
+
+
+/**
+ * Print num_items elements of h_data (anything indexable with operator[]) to stdout, each followed by ", ", then a newline
+ */
+template <typename InputIteratorT>
+void DisplayResults(
+    InputIteratorT h_data,
+    size_t num_items)
+{
+    // Index with size_t: casting num_items to int would truncate counts above INT_MAX
+    for (size_t i = 0; i < num_items; i++)
+    {
+        std::cout << CoutCast(h_data[i]) << ", ";
+    }
+    printf("\n");
+}
+
+
+/**
+ * Print the contents of a device array (copied to a temporary host buffer first); prints an error line instead if the copy cannot be made
+ */
+template <typename T>
+void DisplayDeviceResults(
+    T *d_data,
+    size_t num_items)
+{
+    // Allocate array on host (a NULL result is only an error when there is something to copy)
+    T *h_data = (T*) malloc(num_items * sizeof(T));
+    bool ok = (num_items == 0) || (h_data != NULL);
+    // Copy data back; on failure skip the display rather than print uninitialized memory
+    ok = ok && ((num_items == 0) || (cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost) == cudaSuccess));
+    if (!ok) printf("DisplayDeviceResults: could not copy device data to host\n");
+    if (ok) DisplayResults(h_data, num_items);
+
+    // Cleanup
+    if (h_data) free(h_data);
+}
+
+
+/******************************************************************************
+ * Segment descriptor generation
+ ******************************************************************************/
+
+/**
+ * Initialize segments: writes num_segments + 1 offsets into h_segment_offsets (which must have room for them), with lengths drawn via RandomValue. Inline so the definition can be included from several translation units.
+ */
+inline void InitializeSegments(
+    int     num_items,
+    int     num_segments,
+    int     *h_segment_offsets,
+    bool    verbose = false)
+{
+    if (num_segments <= 0)
+        return;
+    // Ceiling of num_items / num_segments; random lengths are requested with an upper argument of twice this plus one
+    unsigned int expected_segment_length = (num_items + num_segments - 1) / num_segments;
+    int offset = 0;
+    for (int i = 0; i < num_segments; ++i)
+    {
+        h_segment_offsets[i] = offset;
+        // Advance by a random length, clamping so no offset exceeds num_items
+        unsigned int segment_length = RandomValue((expected_segment_length * 2) + 1);
+        offset += segment_length;
+        offset = CUB_MIN(offset, num_items);
+    }
+    h_segment_offsets[num_segments] = num_items;    // terminating offset: the last segment always ends at num_items
+
+    if (verbose)
+    {
+        printf("Segment offsets: ");
+        DisplayResults(h_segment_offsets, num_segments + 1);
+    }
+}
+
+
+/******************************************************************************
+ * Timing
+ ******************************************************************************/
+
+
+struct CpuTimer     // Host-side stopwatch: call Start(), then Stop(), then read ElapsedMillis()
+{
+#if defined(_WIN32) || defined(_WIN64)
+    // Windows build: timestamps are taken with QueryPerformanceCounter
+    LARGE_INTEGER ll_freq;
+    LARGE_INTEGER ll_start;
+    LARGE_INTEGER ll_stop;
+
+    CpuTimer()
+    {
+        QueryPerformanceFrequency(&ll_freq);    // queried once; used to convert counter ticks to seconds
+    }
+
+    void Start()
+    {
+        QueryPerformanceCounter(&ll_start);
+    }
+
+    void Stop()
+    {
+        QueryPerformanceCounter(&ll_stop);
+    }
+
+    float ElapsedMillis()
+    {
+        double start = double(ll_start.QuadPart) / double(ll_freq.QuadPart);   // ticks / frequency
+        double stop  = double(ll_stop.QuadPart) / double(ll_freq.QuadPart);
+
+        return float((stop - start) * 1000);
+    }
+
+#else
+    // Other platforms: snapshots come from getrusage(RUSAGE_SELF) and only ru_utime (user CPU time) is read
+    rusage start;
+    rusage stop;
+
+    void Start()
+    {
+        getrusage(RUSAGE_SELF, &start);
+    }
+
+    void Stop()
+    {
+        getrusage(RUSAGE_SELF, &stop);
+    }
+
+    float ElapsedMillis()
+    {
+        float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec;
+        float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec;    // may be negative; the sum below is still correct
+
+        return (sec * 1000) + (usec / 1000);
+    }
+
+#endif
+};
+
+struct GpuTimer     // Stopwatch built on a pair of CUDA events: call Start(), then Stop(), then read ElapsedMillis()
+{
+    cudaEvent_t start;
+    cudaEvent_t stop;
+
+    GpuTimer()      // creates both events (return codes are not checked)
+    {
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+    }
+
+    ~GpuTimer()     // destroys both events
+    {
+        cudaEventDestroy(start);
+        cudaEventDestroy(stop);
+    }
+
+    void Start()
+    {
+        cudaEventRecord(start, 0);      // recorded on stream 0
+    }
+
+    void Stop()
+    {
+        cudaEventRecord(stop, 0);       // recorded on stream 0
+    }
+
+    float ElapsedMillis()
+    {
+        float elapsed;
+        cudaEventSynchronize(stop);     // wait for the stop event before querying the interval
+        cudaEventElapsedTime(&elapsed, start, stop);
+        return elapsed;
+    }
+};
diff --git a/external/cub/test/test_warp_reduce.cu b/external/cub/test/test_warp_reduce.cu
new file mode 100644
index 00000000000..130f20e3e87
--- /dev/null
+++ b/external/cub/test/test_warp_reduce.cu
@@ -0,0 +1,840 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of WarpReduce utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <cub/warp/warp_reduce.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose       = false;
+int                     g_repeat        = 0;
+CachingDeviceAllocator  g_allocator(true);
+
+
+/**
+ * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants). Wraps op; in device code it first calls cub::ThreadTrap() if the calling lane's index within its logical warp is >= num_valid.
+ */
+template<
+    typename    OpT,
+    int         LOGICAL_WARP_THREADS>
+struct WrapperFunctor
+{
+    OpT op;
+    int num_valid;
+
+    inline __host__ __device__ WrapperFunctor(OpT op, int num_valid) : op(op), num_valid(num_valid) {}
+    // Applies op(a, b); the lane check below is compiled only when CUB_PTX_ARCH != 0
+    template <typename T>
+    inline __host__ __device__ T operator()(const T &a, const T &b) const
+    {
+#if CUB_PTX_ARCH != 0
+        if ((cub::LaneId() % LOGICAL_WARP_THREADS) >= num_valid)
+            cub::ThreadTrap();
+#endif
+
+        return op(a, b);
+    }
+
+};
+
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * Generic reduction: each static method constructs WarpReduce on temp_storage and forwards to the method of the same name, passing reduction_op
+ */
+template <
+    typename    T,
+    typename    ReductionOp,
+    typename    WarpReduce,
+    bool        PRIMITIVE = Traits<T>::PRIMITIVE>
+struct DeviceTest
+{
+    static __device__ __forceinline__ T Reduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        ReductionOp                         &reduction_op)
+    {
+        return WarpReduce(temp_storage).Reduce(data, reduction_op);
+    }
+    // Same as above, additionally passing valid_warp_threads through to WarpReduce::Reduce
+    static __device__ __forceinline__ T Reduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        ReductionOp                         &reduction_op,
+        const int                           &valid_warp_threads)
+    {
+        return WarpReduce(temp_storage).Reduce(data, reduction_op, valid_warp_threads);
+    }
+    // Forwards data and flag to WarpReduce::HeadSegmentedReduce
+    template <typename FlagT>
+    static __device__ __forceinline__ T HeadSegmentedReduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        FlagT                                &flag,
+        ReductionOp                         &reduction_op)
+    {
+        return WarpReduce(temp_storage).HeadSegmentedReduce(data, flag, reduction_op);
+    }
+    // Forwards data and flag to WarpReduce::TailSegmentedReduce
+    template <typename FlagT>
+    static __device__ __forceinline__ T TailSegmentedReduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        FlagT                                &flag,
+        ReductionOp                         &reduction_op)
+    {
+        return WarpReduce(temp_storage).TailSegmentedReduce(data, flag, reduction_op);
+    }
+
+};
+
+
+/**
+ * Summation: specialization for ReductionOp == Sum with PRIMITIVE == true; forwards to the WarpReduce *Sum methods and leaves reduction_op unused
+ */
+template <
+    typename    T,
+    typename    WarpReduce>
+struct DeviceTest<T, Sum, WarpReduce, true>
+{
+    static __device__ __forceinline__ T Reduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        Sum                              &reduction_op)
+    {
+        return WarpReduce(temp_storage).Sum(data);
+    }
+    // Same as above, additionally passing valid_warp_threads through to WarpReduce::Sum
+    static __device__ __forceinline__ T Reduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        Sum                              &reduction_op,
+        const int                           &valid_warp_threads)
+    {
+        return WarpReduce(temp_storage).Sum(data, valid_warp_threads);
+    }
+    // Forwards data and flag to WarpReduce::HeadSegmentedSum
+    template <typename FlagT>
+    static __device__ __forceinline__ T HeadSegmentedReduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        FlagT                                &flag,
+        Sum                              &reduction_op)
+    {
+        return WarpReduce(temp_storage).HeadSegmentedSum(data, flag);
+    }
+    // Forwards data and flag to WarpReduce::TailSegmentedSum
+    template <typename FlagT>
+    static __device__ __forceinline__ T TailSegmentedReduce(
+        typename WarpReduce::TempStorage    &temp_storage,
+        T                                   &data,
+        FlagT                                &flag,
+        Sum                              &reduction_op)
+    {
+        return WarpReduce(temp_storage).TailSegmentedSum(data, flag);
+    }
+
+};
+
+
+/**
+ * Full-tile warp reduction kernel: each thread reads d_in[threadIdx.x], reduces it with its logical warp, and writes d_out[threadIdx.x]. Indexing uses threadIdx.x only -- presumably a single-block launch; confirm at the launch site.
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    ReductionOp>
+__global__ void FullWarpReduceKernel(
+    T               *d_in,
+    T               *d_out,
+    ReductionOp     reduction_op,
+    clock_t         *d_elapsed)
+{
+    // Cooperative warp-reduce utility type (logical warps of LOGICAL_WARP_THREADS threads)
+    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
+
+    // Allocate temp storage in shared memory (one slot per logical warp, WARPS slots in total)
+    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
+
+    // Per-thread tile data
+    T input = d_in[threadIdx.x];
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t start = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Test warp reduce (warp_id selects this thread's temp storage slot)
+    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
+
+    T output = DeviceTest<T, ReductionOp, WarpReduce>::Reduce(
+        temp_storage[warp_id], input, reduction_op);
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t stop = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    *d_elapsed = stop - start;  // every thread stores its own delta to the same location
+
+    // Store aggregate: the first thread of each logical warp writes the reduction result, all others echo their input
+    d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ?
+        output :
+        input;
+}
+
+/**
+ * Partially-full warp reduction kernel: like the full-tile kernel, but forwards
+ * valid_warp_threads to the reduction.  Thread 0 stores the clock delta to *d_elapsed.
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    ReductionOp>
+__global__ void PartialWarpReduceKernel(
+    T           *d_in,
+    T           *d_out,
+    ReductionOp reduction_op,
+    clock_t     *d_elapsed,
+    int         valid_warp_threads)
+{
+    // Cooperative warp-reduce utility type
+    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
+
+    // Allocate temp storage in shared memory (one slot per logical warp)
+    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
+
+    // Per-thread tile data
+    T input = d_in[threadIdx.x];
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t start = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Test partial-warp reduce
+    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
+    T output = DeviceTest<T, ReductionOp, WarpReduce>::Reduce(
+        temp_storage[warp_id], input, reduction_op, valid_warp_threads);
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t stop = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Single writer for the timing slot, so threads do not race on *d_elapsed
+    if (threadIdx.x == 0) *d_elapsed = stop - start;
+
+    // Store aggregate: lane 0 of each logical warp stores the result, others echo input
+    d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ? output : input;
+}
+
+
+/**
+ * Head-based segmented warp reduction test kernel.  Segment-start lanes (lane 0 of a logical
+ * warp, or a set head flag) store the reduction output; thread 0 stores the clock delta.
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    FlagT,
+    typename    ReductionOp>
+__global__ void WarpHeadSegmentedReduceKernel(
+    T           *d_in,
+    FlagT        *d_head_flags,
+    T           *d_out,
+    ReductionOp reduction_op,
+    clock_t     *d_elapsed)
+{
+    // Cooperative warp-reduce utility type
+    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
+
+    // Allocate temp storage in shared memory (one slot per logical warp)
+    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
+
+    // Per-thread tile data
+    T       input       = d_in[threadIdx.x];
+    FlagT   head_flag   = d_head_flags[threadIdx.x];
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t start = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Test segmented warp reduce
+    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
+    T output = DeviceTest<T, ReductionOp, WarpReduce>::HeadSegmentedReduce(
+        temp_storage[warp_id], input, head_flag, reduction_op);
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t stop = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Single writer for the timing slot, so threads do not race on *d_elapsed
+    if (threadIdx.x == 0) *d_elapsed = stop - start;
+
+    // Store aggregate
+    d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ? output : input;
+}
+
+
+/**
+ * Tail-based segmented warp reduction test kernel.  A lane is treated as a segment
+ * start when it is lane 0 of a logical warp or when the preceding thread's tail flag
+ * is set; such lanes store the reduction output, the rest echo their input.
+ * Thread 0 stores the clock delta measured around the reduction to *d_elapsed.
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    FlagT,
+    typename    ReductionOp>
+__global__ void WarpTailSegmentedReduceKernel(
+    T           *d_in,
+    FlagT       *d_tail_flags,
+    T           *d_out,
+    ReductionOp reduction_op,
+    clock_t     *d_elapsed)
+{
+    // Cooperative warp-reduce utility type
+    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
+
+    // Allocate temp storage in shared memory (one slot per logical warp)
+    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
+
+    // Per-thread tile data (head_flag is the previous thread's tail flag; 0 for thread 0)
+    T       input       = d_in[threadIdx.x];
+    FlagT    tail_flag   = d_tail_flags[threadIdx.x];
+    FlagT    head_flag   = (threadIdx.x == 0) ? 0 : d_tail_flags[threadIdx.x - 1];
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t start = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Test segmented warp reduce
+    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
+    T output = DeviceTest<T, ReductionOp, WarpReduce>::TailSegmentedReduce(
+        temp_storage[warp_id], input, tail_flag, reduction_op);
+
+    // Record elapsed clocks
+    __threadfence_block();      // workaround to prevent clock hoisting
+    clock_t stop = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Single writer for the timing slot, so threads do not race on *d_elapsed
+    if (threadIdx.x == 0) *d_elapsed = stop - start;
+
+    // Store aggregate
+    d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ? output : input;
+}
+
+
+//---------------------------------------------------------------------
+// Host utility subroutines
+//---------------------------------------------------------------------
+
+/**
+ * Initialize reduction problem (and solution).
+ *
+ * Fills h_in with generated values and h_flags with one random flag bit per item,
+ * then, for every warp, walks its first valid_warp_threads items back-to-front to
+ * compute the expected head-flagged (h_head_out) and tail-flagged (h_tail_out)
+ * segment aggregates.  Entries that receive no aggregate keep the input value.
+ */
+template <
+    typename    T,
+    typename    ReductionOp>
+void Initialize(
+    GenMode     gen_mode,
+    int         flag_entropy,
+    T           *h_in,
+    int         *h_flags,
+    int         warps,
+    int         warp_threads,
+    int         valid_warp_threads,
+    ReductionOp reduction_op,
+    T           *h_head_out,
+    T           *h_tail_out)
+{
+    const int num_items = warps * warp_threads;
+    // Generate inputs and flags; both references start out as a copy of the input
+    for (int item = 0; item < num_items; ++item)
+    {
+        InitValue(gen_mode, h_in[item], item);
+        h_head_out[item] = h_in[item];
+        h_tail_out[item] = h_in[item];
+
+        // The lowest sampled bit decides whether this item carries a flag
+        char random_bits;
+        RandomBits(random_bits, flag_entropy);
+        h_flags[item] = random_bits & 0x1;
+    }
+
+    // Accumulate segments (lane 0 of each warp is implicitly a segment head)
+    for (int warp = 0; warp < warps; ++warp)
+    {
+        const int first = warp * warp_threads;
+        const int last  = first + valid_warp_threads - 1;
+
+        // Seed both running aggregates with the last valid item of the warp
+        T head_aggregate = h_in[last];
+        T tail_aggregate = h_in[last];
+        if (h_flags[last])
+            h_head_out[last] = head_aggregate;
+
+        // Walk the remaining items from back to front
+        for (int item = last - 1; item >= first; --item)
+        {
+            // If the next item is flagged the head aggregate restarts here, else it absorbs this item
+            if (!h_flags[item + 1])
+                head_aggregate = reduction_op(head_aggregate, h_in[item]);
+            else
+                head_aggregate = h_in[item];
+
+            if (!h_flags[item])
+            {
+                tail_aggregate = reduction_op(tail_aggregate, h_in[item]);
+            }
+            else
+            {
+                // Flagged item: record both aggregates, then restart the tail aggregate
+                h_head_out[item]     = head_aggregate;
+                h_tail_out[item + 1] = tail_aggregate;
+                tail_aggregate       = h_in[item];
+            }
+        }
+
+        // The warp's first item always receives the final aggregates
+        h_head_out[first] = head_aggregate;
+        h_tail_out[first] = tail_aggregate;
+    }
+}
+
+
+/**
+ * Test warp reduction.
+ *
+ * Builds the host reference, launches the full-warp kernel (all lanes valid) or
+ * the partial-warp kernel (otherwise) on one block of WARPS * LOGICAL_WARP_THREADS
+ * threads, and compares the device output against the reference.
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    ReductionOp>
+void TestReduce(
+    GenMode     gen_mode,
+    ReductionOp reduction_op,
+    int         valid_warp_threads = LOGICAL_WARP_THREADS)
+{
+    const int       BLOCK_THREADS   = LOGICAL_WARP_THREADS * WARPS;
+    const size_t    TILE_BYTES      = sizeof(T) * BLOCK_THREADS;
+
+    // Host buffers (h_tail_out is filled by Initialize but not compared here)
+    T   *h_in       = new T[BLOCK_THREADS];
+    int *h_flags    = new int[BLOCK_THREADS];
+    T   *h_out      = new T[BLOCK_THREADS];
+    T   *h_tail_out = new T[BLOCK_THREADS];
+
+    // Build the problem and its reference solution (flag entropy argument is -1)
+    Initialize(gen_mode, -1, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, valid_warp_threads, reduction_op, h_out, h_tail_out);
+
+    // Device buffers
+    T       *d_in       = NULL;
+    T       *d_out      = NULL;
+    clock_t *d_elapsed  = NULL;
+
+    // Allocate, upload the input, and zero the output
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, TILE_BYTES));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, TILE_BYTES));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
+    CubDebugExit(cudaMemcpy(d_in, h_in, TILE_BYTES, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, TILE_BYTES));
+
+    // Optionally dump the valid portion of each warp's input
+    if (g_verbose)
+    {
+        printf("Data:\n");
+        for (int warp = 0; warp < WARPS; ++warp)
+        {
+            DisplayResults(h_in + (warp * LOGICAL_WARP_THREADS), valid_warp_threads);
+        }
+    }
+
+    // Describe the configuration being run
+    printf("\nGen-mode %d, %d warps, %d warp threads, %d valid lanes, %s (%d bytes) elements:\n",
+        gen_mode, WARPS, LOGICAL_WARP_THREADS, valid_warp_threads,
+        typeid(T).name(), (int) sizeof(T));
+    fflush(stdout);
+
+    // Launch a single thread block
+    if (valid_warp_threads != LOGICAL_WARP_THREADS)
+    {
+        // Some lanes are invalid: partial-warp kernel
+        PartialWarpReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
+            d_in, d_out, reduction_op, d_elapsed, valid_warp_threads);
+    }
+    else
+    {
+        // Every lane is valid: full-warp kernel
+        FullWarpReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
+            d_in, d_out, reduction_op, d_elapsed);
+    }
+
+    // Check for launch errors, then wait for the kernel to finish
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Compare against the reference and report
+    printf("\tReduction results: ");
+    const int mismatch = CompareDeviceResults(h_out, d_out, BLOCK_THREADS, g_verbose, g_verbose);
+    printf("%s\n", mismatch ? "FAIL" : "PASS");
+    AssertEquals(0, mismatch);
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    // Cleanup (delete[] accepts null pointers, so no guard is needed on the host side)
+    delete[] h_in;
+    delete[] h_flags;
+    delete[] h_out;
+    delete[] h_tail_out;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Test warp segmented reduction.
+ *
+ * Builds one flagged problem, then runs the head-flagged kernel followed by the
+ * tail-flagged kernel on the same input and flags, comparing each result against
+ * its own host reference.
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    ReductionOp>
+void TestSegmentedReduce(
+    GenMode     gen_mode,
+    int         flag_entropy,
+    ReductionOp reduction_op)
+{
+    const int       BLOCK_THREADS   = LOGICAL_WARP_THREADS * WARPS;
+    const size_t    TILE_BYTES      = sizeof(T) * BLOCK_THREADS;
+    const size_t    FLAG_BYTES      = sizeof(int) * BLOCK_THREADS;
+
+    // Host buffers: input, flags, and one reference per flag convention
+    T   *h_in       = new T[BLOCK_THREADS];
+    int *h_flags    = new int[BLOCK_THREADS];
+    T   *h_head_out = new T[BLOCK_THREADS];
+    T   *h_tail_out = new T[BLOCK_THREADS];
+
+    // Build the problem and both reference solutions (every lane is valid)
+    Initialize(gen_mode, flag_entropy, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, LOGICAL_WARP_THREADS, reduction_op, h_head_out, h_tail_out);
+
+    // Device buffers
+    T       *d_in       = NULL;
+    int     *d_flags    = NULL;
+    T       *d_head_out = NULL;
+    T       *d_tail_out = NULL;
+    clock_t *d_elapsed  = NULL;
+
+    // Allocate, upload input and flags, and zero both outputs
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, TILE_BYTES));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, FLAG_BYTES));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_head_out, TILE_BYTES));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_tail_out, TILE_BYTES));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
+    CubDebugExit(cudaMemcpy(d_in, h_in, TILE_BYTES, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemcpy(d_flags, h_flags, FLAG_BYTES, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_head_out, 0, TILE_BYTES));
+    CubDebugExit(cudaMemset(d_tail_out, 0, TILE_BYTES));
+
+    // Optionally dump each warp's input followed by each warp's flags
+    if (g_verbose)
+    {
+        printf("Data:\n");
+        for (int warp = 0; warp < WARPS; ++warp)
+        {
+            DisplayResults(h_in + (warp * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS);
+        }
+
+        printf("\nFlags:\n");
+        for (int warp = 0; warp < WARPS; ++warp)
+        {
+            DisplayResults(h_flags + (warp * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS);
+        }
+    }
+
+    // Describe the configuration being run
+    printf("\nGen-mode %d, head flag entropy reduction %d, %d warps, %d warp threads, %s (%d bytes) elements:\n",
+        gen_mode, flag_entropy, WARPS, LOGICAL_WARP_THREADS,
+        typeid(T).name(), (int) sizeof(T));
+    fflush(stdout);
+
+    int mismatch;
+
+    // ---- Head-flagged pass ----
+    WarpHeadSegmentedReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
+        d_in, d_flags, d_head_out, reduction_op, d_elapsed);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    printf("\tHead-based segmented reduction results: ");
+    mismatch = CompareDeviceResults(h_head_out, d_head_out, BLOCK_THREADS, g_verbose, g_verbose);
+    printf("%s\n", mismatch ? "FAIL" : "PASS");
+    AssertEquals(0, mismatch);
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    // ---- Tail-flagged pass (same input and flags) ----
+    WarpTailSegmentedReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
+        d_in, d_flags, d_tail_out, reduction_op, d_elapsed);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    printf("\tTail-based segmented reduction results: ");
+    mismatch = CompareDeviceResults(h_tail_out, d_tail_out, BLOCK_THREADS, g_verbose, g_verbose);
+    printf("%s\n", mismatch ? "FAIL" : "PASS");
+    AssertEquals(0, mismatch);
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    // Cleanup (delete[] accepts null pointers, so no guard is needed on the host side)
+    delete[] h_in;
+    delete[] h_flags;
+    delete[] h_head_out;
+    delete[] h_tail_out;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
+    if (d_head_out) CubDebugExit(g_allocator.DeviceFree(d_head_out));
+    if (d_tail_out) CubDebugExit(g_allocator.DeviceFree(d_tail_out));
+    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Run battery of tests for different full and partial tile sizes, for one
+ * element type and operator: partial tiles (with and without the wrapper),
+ * the full tile, and segmented reductions over a range of flag entropies.
+ */
+template <
+    int         WARPS,
+    int         LOGICAL_WARP_THREADS,
+    typename    T,
+    typename    ReductionOp>
+void Test(
+    GenMode     gen_mode,
+    ReductionOp reduction_op)
+{
+    // Partial tiles: valid-lane counts start at 1 and advance by max(1, LOGICAL_WARP_THREADS / 5)
+    const int lane_step = CUB_MAX(1, LOGICAL_WARP_THREADS / 5);
+    for (int valid_lanes = 1; valid_lanes < LOGICAL_WARP_THREADS; valid_lanes += lane_step)
+    {
+        // Without wrapper (to test non-excepting PTX POD-op specializations)
+        TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, reduction_op, valid_lanes);
+
+        // With wrapper to ensure no ops called on OOB lanes
+        WrapperFunctor<ReductionOp, LOGICAL_WARP_THREADS> checked_op(reduction_op, valid_lanes);
+        TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, checked_op, valid_lanes);
+    }
+
+    // Full tile
+    TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, reduction_op, LOGICAL_WARP_THREADS);
+
+    // Segmented reduction with flag entropy levels 0 through 9
+    for (int entropy = 0; entropy != 10; ++entropy)
+    {
+        TestSegmentedReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, entropy, reduction_op);
+    }
+}
+
+
+/**
+ * Run battery of tests for different data types and reduce ops
+ */
+template <int WARPS, int LOGICAL_WARP_THREADS>
+void Test(GenMode gen_mode)
+{
+    // One instance of each operator, shared by all the runs below
+    Sum sum_op = Sum();
+    Max max_op = Max();
+
+    // Integer primitives (char through long long)
+    Test<WARPS, LOGICAL_WARP_THREADS, char>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, short>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, int>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, long long>(gen_mode, sum_op);
+
+    // Unsigned integer primitives
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned char>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned short>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned int>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned long long>(gen_mode, sum_op);
+
+    // Floating-point types are skipped for RANDOM inputs
+    if (gen_mode != RANDOM)
+    {
+        Test<WARPS, LOGICAL_WARP_THREADS, float>(gen_mode, sum_op);
+        Test<WARPS, LOGICAL_WARP_THREADS, double>(gen_mode, sum_op);
+    }
+
+    // Primitives with the alternative reduce op
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned char>(gen_mode, max_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned short>(gen_mode, max_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned int>(gen_mode, max_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, unsigned long long>(gen_mode, max_op);
+
+    // One-, two- and four-component vector types
+    Test<WARPS, LOGICAL_WARP_THREADS, uchar1>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, uchar2>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, ushort2>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, uint2>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, ulonglong2>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, uchar4>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, ushort4>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, uint4>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, ulonglong4>(gen_mode, sum_op);
+
+    // Structured test types
+    Test<WARPS, LOGICAL_WARP_THREADS, TestFoo>(gen_mode, sum_op);
+    Test<WARPS, LOGICAL_WARP_THREADS, TestBar>(gen_mode, sum_op);
+}
+
+
+/**
+ * Run battery of tests for different problem generation options
+ * (UNIFORM, then INTEGER_SEED, then RANDOM)
+ */
+template <int WARPS, int LOGICAL_WARP_THREADS>
+void Test()
+{
+    // Generation modes, in the order they are exercised
+    const GenMode gen_modes[] = {UNIFORM, INTEGER_SEED, RANDOM};
+    for (int m = 0; m < 3; ++m)
+        Test<WARPS, LOGICAL_WARP_THREADS>(gen_modes[m]);
+}
+
+
+/**
+ * Run battery of tests for different number of active warps
+ */
+template <int LOGICAL_WARP_THREADS>
+void Test()
+{
+    // Only power-of-two subwarps can be tiled (32 is also accepted explicitly)
+    const bool can_tile = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE || (LOGICAL_WARP_THREADS == 32);
+    Test<1, LOGICAL_WARP_THREADS>();
+    if (can_tile)
+        Test<2, LOGICAL_WARP_THREADS>();
+}
+
+
+/**
+ * Main: parse options, initialize the device, run the quick or thorough test suite
+ */
+int main(int argc, char** argv)
+{
+    // Parse command line: --v sets g_verbose, --repeat sets g_repeat
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Show usage and quit when help is requested
+    const bool wants_help = args.CheckCmdLineFlag("help");
+    if (wants_help)
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>]"
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#ifdef QUICK_TEST
+
+    // Compile/run quick tests
+    TestReduce<1, 32, int>(UNIFORM, Sum());
+    TestReduce<1, 32, double>(UNIFORM, Sum());
+    TestReduce<2, 16, TestBar>(UNIFORM, Sum());
+    TestSegmentedReduce<1, 32, int>(UNIFORM, 1, Sum());
+
+#else
+
+    // Compile/run thorough tests: the whole suite runs g_repeat + 1 times
+    for (int pass = 0; pass <= g_repeat; ++pass)
+    {
+        // Logical warp sizes under test
+        Test<32>();
+        Test<16>();
+        Test<9>();
+        Test<7>();
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
+
diff --git a/external/cub/test/test_warp_scan.cu b/external/cub/test/test_warp_scan.cu
new file mode 100644
index 00000000000..69a60113495
--- /dev/null
+++ b/external/cub/test/test_warp_scan.cu
@@ -0,0 +1,630 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of WarpScan utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <stdio.h>
+#include <typeinfo>
+
+#include <cub/warp/warp_scan.cuh>
+#include <cub/util_allocator.cuh>
+
+#include "test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose       = false;        // verbose output switch (e.g. gates the input dump in Test)
+int                     g_repeat        = 0;            // NOTE(review): presumably extra repetitions of the suite; not used in the visible code - confirm
+CachingDeviceAllocator  g_allocator(true);              // allocator used for the device buffers in Test
+
+
+/**
+ * Primitive variant to test
+ */
+enum TestMode
+{
+    BASIC,          // selects the DeviceTest overloads that call the scan without an aggregate argument
+    AGGREGATE,      // selects the DeviceTest overloads that also pass the aggregate output argument
+};
+
+
+
+/**
+ * \brief Pass-through functor (wrapping an op precludes test-specialized dispatch to *Sum variants)
+ */
+template <typename OpT>
+struct WrapperFunctor
+{
+    OpT op;     ///< The wrapped binary operator
+
+    /// Store a copy of the operator to forward to
+    WrapperFunctor(OpT wrapped_op) : op(wrapped_op) {}
+
+    /// Forward (lhs, rhs) to the wrapped operator and return its result
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &lhs, const T &rhs) const
+    { return op(lhs, rhs); }
+};
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/// Exclusive scan, basic variant (generic operator path).
+/// Scans data in place, seeded with initial_value; aggregate is not written.
+template <
+    typename    WarpScanT,
+    typename    T,
+    typename    ScanOpT,
+    typename    IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT &warp_scan, T &data, T &initial_value, ScanOpT &scan_op, T &aggregate,
+    Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
+{
+    // data serves as both the input item and the output slot
+    warp_scan.ExclusiveScan(data, data, initial_value, scan_op);
+}
+
+/// Exclusive scan, aggregate variant (generic operator path).
+/// Scans data in place, seeded with initial_value, and passes aggregate as the
+/// additional output argument of the scan call.
+/// test_mode and is_primitive are tag arguments used only for overload selection.
+template <typename WarpScanT, typename T, typename ScanOpT, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT               &warp_scan,
+    T                       &data,
+    T                       &initial_value,
+    ScanOpT                 &scan_op,
+    T                       &aggregate,
+    Int2Type<AGGREGATE>     test_mode,
+    IsPrimitiveT            is_primitive)
+{
+    // Test with cumulative aggregate; data serves as both the input item and
+    // the output slot
+    warp_scan.ExclusiveScan(data, data, initial_value, scan_op, aggregate);
+}
+
+
+/// Exclusive sum, basic variant.
+/// Overload for the Sum operator on primitive types (Int2Type<true>): calls
+/// ExclusiveSum, so initial_value, scan_op and aggregate are not used.
+template <typename WarpScanT, typename T>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT           &warp_scan,
+    T                   &data,
+    T                   &initial_value,
+    Sum                 &scan_op,
+    T                   &aggregate,
+    Int2Type<BASIC>     test_mode,
+    Int2Type<true>      is_primitive)
+{
+    // In-place sum scan of this thread's item
+    warp_scan.ExclusiveSum(data, data);
+}
+
+
+/// Exclusive sum, aggregate variant.
+/// Overload for the Sum operator on primitive types (Int2Type<true>): calls
+/// ExclusiveSum with aggregate as the extra output; initial_value and scan_op are not used.
+template <typename WarpScanT, typename T>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT               &warp_scan,
+    T                       &data,
+    T                       &initial_value,
+    Sum                     &scan_op,
+    T                       &aggregate,
+    Int2Type<AGGREGATE>     test_mode,
+    Int2Type<true>          is_primitive)
+{
+    // Test with cumulative aggregate; in-place sum scan of this thread's item
+    warp_scan.ExclusiveSum(data, data, aggregate);
+}
+
+
+/// Inclusive scan, basic variant (generic operator path).
+/// Chosen when the initial value is a NullType placeholder; scans data in place
+/// and leaves aggregate unwritten.
+/// test_mode and is_primitive are tag arguments used only for overload selection.
+template <typename WarpScanT, typename T, typename ScanOpT, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT           &warp_scan,
+    T                   &data,
+    NullType            &initial_value,
+    ScanOpT             &scan_op,
+    T                   &aggregate,
+    Int2Type<BASIC>     test_mode,
+    IsPrimitiveT        is_primitive)
+{
+    // data serves as both the input item and the output slot; no seed value
+    // is involved
+    warp_scan.InclusiveScan(data, data, scan_op);
+}
+
+/// Inclusive scan, aggregate variant (generic operator path).
+/// Chosen when the initial value is a NullType placeholder; scans data in place
+/// and passes aggregate as the additional output argument of the scan call.
+/// test_mode and is_primitive are tag arguments used only for overload selection.
+template <typename WarpScanT, typename T, typename ScanOpT, typename IsPrimitiveT>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT               &warp_scan,
+    T                       &data,
+    NullType                &initial_value,
+    ScanOpT                 &scan_op,
+    T                       &aggregate,
+    Int2Type<AGGREGATE>     test_mode,
+    IsPrimitiveT            is_primitive)
+{
+    // Test with cumulative aggregate; data serves as both the input item and
+    // the output slot
+    warp_scan.InclusiveScan(data, data, scan_op, aggregate);
+}
+
+/// Inclusive sum basic: overload for the Sum operator on primitive types when the initial value is NullType.
+/// Every template parameter is deducible from the call arguments, so overload resolution can select it.
+template <
+    typename    WarpScanT,
+    typename    T>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT                       &warp_scan,
+    T                               &data,
+    NullType                        &initial_value,
+    Sum                             &scan_op,
+    T                               &aggregate,
+    Int2Type<BASIC>                 test_mode,
+    Int2Type<true>                  is_primitive)
+{
+    // Test basic warp scan (in place; initial_value, scan_op and aggregate are not used)
+    warp_scan.InclusiveSum(data, data);
+}
+
+/// Inclusive sum aggregate: overload for the Sum operator on primitive types when the initial value is NullType.
+/// Every template parameter is deducible from the call arguments, so overload resolution can select it.
+template <
+    typename    WarpScanT,
+    typename    T>
+__device__ __forceinline__ void DeviceTest(
+    WarpScanT                       &warp_scan,
+    T                               &data,
+    NullType                        &initial_value,
+    Sum                             &scan_op,
+    T                               &aggregate,
+    Int2Type<AGGREGATE>             test_mode,
+    Int2Type<true>                  is_primitive)
+{
+    // Test with cumulative aggregate (in place; initial_value and scan_op are not used)
+    warp_scan.InclusiveSum(data, data, aggregate);
+}
+
+
+/**
+ * WarpScan test kernel.
+ *
+ * Each thread scans d_in[threadIdx.x] through the DeviceTest overload selected by
+ * TEST_MODE, the scan operator, and InitialValueT, and writes its result to d_out.
+ * The aggregate is stored only when TEST_MODE is not BASIC; thread 0 stores the
+ * absolute clock delta measured around the scan to *d_elapsed.
+ */
+template <
+    int         LOGICAL_WARP_THREADS,
+    TestMode    TEST_MODE,
+    typename    T,
+    typename    ScanOpT,
+    typename    InitialValueT>
+__global__ void WarpScanKernel(
+    T               *d_in,
+    T               *d_out,
+    T               *d_aggregate,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value,
+    clock_t         *d_elapsed)
+{
+    // Warp-scan collective over LOGICAL_WARP_THREADS lanes, backed by shared temp storage
+    typedef WarpScan<T, LOGICAL_WARP_THREADS> WarpScanT;
+    __shared__ typename WarpScanT::TempStorage temp_storage;
+
+    const unsigned int tid = threadIdx.x;
+    // This thread's item; scanned in place
+    T item = d_in[tid];
+
+    // Start cycle timer
+    __threadfence_block();      // workaround to prevent clock hoisting
+    const clock_t clock_begin = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Written only by the AGGREGATE overloads of DeviceTest
+    T warp_aggregate;
+
+    // Test scan
+    WarpScanT scanner(temp_storage);
+    DeviceTest(
+        scanner, item, initial_value, scan_op, warp_aggregate,
+        Int2Type<TEST_MODE>(),
+        Int2Type<Traits<T>::PRIMITIVE>());
+
+    // Stop cycle timer
+    __threadfence_block();      // workaround to prevent clock hoisting
+    const clock_t clock_end = clock();
+    __threadfence_block();      // workaround to prevent clock hoisting
+
+    // Store data
+    d_out[tid] = item;
+
+    // Store aggregate
+    if (TEST_MODE != BASIC)
+        d_aggregate[tid] = warp_aggregate;
+
+    // Store time (absolute difference of the two clock readings)
+    if (tid == 0)
+    {
+        if (clock_begin > clock_end) *d_elapsed = clock_begin - clock_end;
+        else                         *d_elapsed = clock_end - clock_begin;
+    }
+}
+
+
+//---------------------------------------------------------------------
+// Host utility subroutines
+//---------------------------------------------------------------------
+
+/**
+ * Initialize exclusive-scan problem (and solution).
+ * Generates num_items inputs into h_in, writes the exclusive prefix of each item
+ * (seeded with initial_value) to h_reference, and returns the reduction of all inputs.
+ */
+template <typename T, typename ScanOpT>
+T Initialize(
+    GenMode         gen_mode,
+    T               *h_in,
+    T               *h_reference,
+    int             num_items,
+    ScanOpT         scan_op,
+    T               initial_value)
+{
+    // Item 0 is handled up front: its exclusive prefix is the seed itself
+    InitValue(gen_mode, h_in[0], 0);
+    h_reference[0]      = initial_value;
+    T total             = h_in[0];
+    T running_prefix    = scan_op(initial_value, h_in[0]);
+
+    for (int item = 1; item < num_items; ++item)
+    {
+        InitValue(gen_mode, h_in[item], item);
+        // The prefix accumulated so far excludes the current item
+        h_reference[item]   = running_prefix;
+        running_prefix      = scan_op(running_prefix, h_in[item]);
+        total               = scan_op(total, h_in[item]);
+    }
+    return total;
+}
+
+
+/**
+ * Initialize inclusive-scan problem (and solution).
+ * Generates num_items inputs into h_in, writes the inclusive prefix of each item
+ * to h_reference, and returns the reduction of all inputs.
+ */
+template <typename T, typename ScanOpT>
+T Initialize(
+    GenMode     gen_mode,
+    T           *h_in,
+    T           *h_reference,
+    int         num_items,
+    ScanOpT     scan_op,
+    NullType)
+{
+    // Item 0 is its own inclusive prefix
+    InitValue(gen_mode, h_in[0], 0);
+    T running_prefix    = h_in[0];
+    T total             = h_in[0];
+    h_reference[0]      = running_prefix;
+
+    for (int item = 1; item < num_items; ++item)
+    {
+        InitValue(gen_mode, h_in[item], item);
+        running_prefix      = scan_op(running_prefix, h_in[item]);
+        total               = scan_op(total, h_in[item]);
+        h_reference[item]   = running_prefix;
+    }
+
+    return total;
+}
+
+
+/**
+ * Test warp scan
+ */
+template <
+    int             LOGICAL_WARP_THREADS,
+    TestMode        TEST_MODE,
+    typename        T,
+    typename        ScanOpT,
+    typename        InitialValueT>        // NullType implies inclusive-scan, otherwise exclusive-scan
+void Test(
+    GenMode         gen_mode,
+    ScanOpT         scan_op,
+    InitialValueT   initial_value)
+{
+    // Allocate host arrays
+    T *h_in = new T[LOGICAL_WARP_THREADS];
+    T *h_reference = new T[LOGICAL_WARP_THREADS];
+    T *h_aggregate = new T[LOGICAL_WARP_THREADS];
+
+    // Initialize problem
+    T aggregate = Initialize(
+        gen_mode,
+        h_in,
+        h_reference,
+        LOGICAL_WARP_THREADS,
+        scan_op,
+        initial_value);
+
+    if (g_verbose)
+    {
+        printf("Input: \n");
+        DisplayResults(h_in, LOGICAL_WARP_THREADS);
+        printf("\n");
+    }
+
+    for (int i = 0; i < LOGICAL_WARP_THREADS; ++i)
+    {
+        h_aggregate[i] = aggregate;
+    }
+
+    // Initialize/clear device arrays
+    T *d_in = NULL;
+    T *d_out = NULL;
+    T *d_aggregate = NULL;
+    clock_t *d_elapsed = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * LOGICAL_WARP_THREADS));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * (LOGICAL_WARP_THREADS + 1)));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_aggregate, sizeof(T) * LOGICAL_WARP_THREADS));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * LOGICAL_WARP_THREADS, cudaMemcpyHostToDevice));
+    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * (LOGICAL_WARP_THREADS + 1)));
+    CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * LOGICAL_WARP_THREADS));
+
+    // Run kernel
+    printf("Test-mode %d (%s), gen-mode %d (%s), %s warpscan, %d warp threads, %s (%d bytes) elements:\n",
+        TEST_MODE, typeid(TEST_MODE).name(),
+        gen_mode, typeid(gen_mode).name(),
+        (Equals<InitialValueT, NullType>::VALUE) ? "Inclusive" : "Exclusive",
+        LOGICAL_WARP_THREADS,
+        typeid(T).name(),
+        (int) sizeof(T));
+    fflush(stdout);
+
+    // Run aggregate/prefix kernel
+    WarpScanKernel<LOGICAL_WARP_THREADS, TEST_MODE><<<1, LOGICAL_WARP_THREADS>>>(
+        d_in,
+        d_out,
+        d_aggregate,
+        scan_op,
+        initial_value,
+        d_elapsed);
+
+    printf("\tElapsed clocks: ");
+    DisplayDeviceResults(d_elapsed, 1);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Copy out and display results
+    printf("\tScan results: ");
+    int compare = CompareDeviceResults(h_reference, d_out, LOGICAL_WARP_THREADS, g_verbose, g_verbose);
+    printf("%s\n", compare ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Copy out and display aggregate
+    if (TEST_MODE == AGGREGATE)
+    {
+        printf("\tScan aggregate: ");
+        compare = CompareDeviceResults(h_aggregate, d_aggregate, LOGICAL_WARP_THREADS, g_verbose, g_verbose);
+        printf("%s\n", compare ? "FAIL" : "PASS");
+        AssertEquals(0, compare);
+    }
+
+    // Cleanup
+    if (h_in) delete[] h_in;
+    if (h_reference) delete[] h_reference;
+    if (h_aggregate) delete[] h_aggregate;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+    if (d_aggregate) CubDebugExit(g_allocator.DeviceFree(d_aggregate));
+    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
+}
+
+
+/**
+ * Run battery of tests for different primitive variants:
+ * exclusive, exclusive with an initial value, and inclusive,
+ * each in BASIC and AGGREGATE test modes.
+ */
+template <int LOGICAL_WARP_THREADS, typename ScanOpT, typename T>
+void Test(
+    GenMode     gen_mode,
+    ScanOpT     scan_op,
+    T           initial_value)
+{
+    // Non-specialized wrapper of the scan op, so we can use initial-value
+    WrapperFunctor<ScanOpT> wrapped_op(scan_op);
+
+    // Exclusive
+    Test<LOGICAL_WARP_THREADS, BASIC, T>(gen_mode, scan_op, T());
+    Test<LOGICAL_WARP_THREADS, AGGREGATE, T>(gen_mode, scan_op, T());
+    // Exclusive, seeded with the caller's initial value
+    Test<LOGICAL_WARP_THREADS, BASIC, T>(gen_mode, wrapped_op, initial_value);
+    Test<LOGICAL_WARP_THREADS, AGGREGATE, T>(gen_mode, wrapped_op, initial_value);
+    // Inclusive
+    Test<LOGICAL_WARP_THREADS, BASIC, T>(gen_mode, scan_op, NullType());
+    Test<LOGICAL_WARP_THREADS, AGGREGATE, T>(gen_mode, scan_op, NullType());
+}
+
+
+/**
+ * Run battery of tests for different data types and scan ops at one logical warp size
+ */
+template <int LOGICAL_WARP_THREADS>
+void Test(GenMode gen_mode)
+{
+    // Get device ordinal (the value is not used below; the call is still error-checked)
+    int device_ordinal;
+    CubDebugExit(cudaGetDevice(&device_ordinal));
+
+    // Get ptx version (gates the double-precision tests below)
+    int ptx_version;
+    CubDebugExit(PtxVersion(ptx_version));
+
+    // primitive (sum)
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (char) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (short) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (int) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (long) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (long long) 99);
+    if (gen_mode != RANDOM) {
+        // Floating point: only test numerically stable (non-random) inputs
+        Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (float) 99);
+        if (ptx_version > 100)
+            Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (double) 99);
+    }
+
+    // primitive (alternative scan op: max over unsigned types)
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned char) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned short) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned int) 99);
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned long long) 99);
+
+    // vec-2
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_uchar2(17, 21));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_ushort2(17, 21));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_uint2(17, 21));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_ulong2(17, 21));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_ulonglong2(17, 21));
+    if (gen_mode != RANDOM) {
+        // Floating point: only test numerically stable (non-random) inputs
+        Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_float2(17, 21));
+        if (ptx_version > 100)
+            Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_double2(17, 21));
+    }
+
+    // vec-4
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_char4(17, 21, 32, 85));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_short4(17, 21, 32, 85));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_int4(17, 21, 32, 85));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_long4(17, 21, 32, 85));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_longlong4(17, 21, 32, 85));
+    if (gen_mode != RANDOM) {
+        // Floating point: only test numerically stable (non-random) inputs
+        Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_float4(17, 21, 32, 85));
+        if (ptx_version > 100)
+            Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_double4(17, 21, 32, 85));
+    }
+
+    // complex (user-defined test types)
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), TestFoo::MakeTestFoo(17, 21, 32, 85));
+    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), TestBar(17, 21));
+
+}
+
+
+/**
+ * Run the type/op battery once per problem generation mode, in a fixed order
+ */
+template <int LOGICAL_WARP_THREADS>
+void Test()
+{
+    const GenMode gen_modes[] = {UNIFORM, INTEGER_SEED, RANDOM};
+    for (int m = 0; m < 3; ++m)
+        Test<LOGICAL_WARP_THREADS>(gen_modes[m]);
+}
+
+
+/**
+ * Main: parse the command line, select the device, then run the warp-scan tests
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+    args.GetCmdLineArgument("repeat", g_repeat);
+
+    // Print usage (each option is followed by a space so they do not run together)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--repeat=<repetitions of entire test suite>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#ifdef QUICK_TEST
+
+    // Compile/run quick tests
+    Test<32, AGGREGATE, int>(UNIFORM, Sum(), (int) 0);
+    Test<32, AGGREGATE, float>(UNIFORM, Sum(), (float) 0);
+    Test<32, AGGREGATE, long long>(UNIFORM, Sum(), (long long) 0);
+    Test<32, AGGREGATE, double>(UNIFORM, Sum(), (double) 0);
+
+    typedef KeyValuePair<int, float> T;
+    cub::Sum sum_op;
+    Test<32, AGGREGATE, T>(UNIFORM, ReduceBySegmentOp<cub::Sum>(sum_op), T());
+
+#else
+
+    // Compile/run thorough tests (the suite runs g_repeat + 1 times)
+    for (int i = 0; i <= g_repeat; ++i)
+    {
+        // Test logical warp sizes
+        Test<32>();
+        Test<16>();
+        Test<9>();
+        Test<7>();
+    }
+
+#endif
+
+    return 0;
+}
+
+
+
+
diff --git a/external/cub/tune/Makefile b/external/cub/tune/Makefile
new file mode 100644
index 00000000000..cf55efa3001
--- /dev/null
+++ b/external/cub/tune/Makefile
@@ -0,0 +1,192 @@
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+ 
+#-------------------------------------------------------------------------------
+# Build script for project
+#-------------------------------------------------------------------------------
+
+NVCC = "$(shell which nvcc)"
+NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' |  sed 's/,.*//'))
+
+# detect OS (tr classes are quoted so the shell cannot glob-expand them)
+OSUPPER = $(shell uname -s 2>/dev/null | tr '[:lower:]' '[:upper:]')
+
+#-------------------------------------------------------------------------------
+# Libs
+#-------------------------------------------------------------------------------
+
+
+#-------------------------------------------------------------------------------
+# Includes
+#-------------------------------------------------------------------------------
+
+INC = -I. -I.. -I../test
+
+#-------------------------------------------------------------------------------
+# Libs
+#-------------------------------------------------------------------------------
+
+LIBS += -lcudart 
+
+#-------------------------------------------------------------------------------
+# Defines
+#-------------------------------------------------------------------------------
+
+DEFINES = 
+
+#-------------------------------------------------------------------------------
+# SM Arch
+#-------------------------------------------------------------------------------
+
+ifdef sm
+	SM_ARCH = $(sm)
+else 
+    SM_ARCH = 200
+endif
+
+# Only one arch per tuning binary
+ifeq (350, $(findstring 350, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_35
+    SM_ARCH = 350
+endif
+ifeq (300, $(findstring 300, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_30
+    SM_ARCH = 300
+endif
+ifeq (200, $(findstring 200, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_20
+    SM_ARCH = 200
+endif
+ifeq (130, $(findstring 130, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_13
+    SM_ARCH = 130
+endif
+ifeq (110, $(findstring 110, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_11 
+    SM_ARCH = 110
+endif
+ifeq (100, $(findstring 100, $(SM_ARCH)))
+    SM_TARGETS = -arch=sm_10 
+    SM_ARCH = 100
+endif
+
+
+#-------------------------------------------------------------------------------
+# Compiler Flags
+#-------------------------------------------------------------------------------
+
+NVCCFLAGS = -Xptxas -v -Xcudafe -\#
+
+# Help the compiler/linker work with huge numbers of kernels on Windows
+ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
+	NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
+endif
+
+# 32/64-bit (64-bit by default; force32=1 selects a 32-bit build)
+ifeq ($(force32), 1)
+	CPU_ARCH = -m32
+	CPU_ARCH_SUFFIX = i386
+else
+	CPU_ARCH = -m64
+	CPU_ARCH_SUFFIX = x86_64
+endif
+
+# CUDA ABI enable/disable (enabled by default) 
+ifneq ($(abi), 0)
+	ABI_SUFFIX = abi
+else 
+	NVCCFLAGS += -Xptxas -abi=no
+	ABI_SUFFIX = noabi
+endif
+
+# NVVM/Open64 middle-end compiler (nvvm by default)
+ifeq ($(open64), 1)
+	NVCCFLAGS += -open64
+	PTX_SUFFIX = open64
+else 
+	PTX_SUFFIX = nvvm
+endif
+
+# Verbose toolchain output from nvcc
+ifeq ($(verbose), 1)
+	NVCCFLAGS += -v
+endif
+
+# Keep intermediate compilation artifacts
+ifeq ($(keep), 1)
+	NVCCFLAGS += -keep
+endif
+
+# Data type size to compile a schmoo binary for
+ifdef tunesize
+    TUNE_SIZE = $(tunesize)
+else 
+	TUNE_SIZE = 4
+endif
+
+
+SUFFIX = $(TUNE_SIZE)B_sm$(SM_ARCH)_$(PTX_SUFFIX)_$(NVCC_VERSION)_$(ABI_SUFFIX)_$(CPU_ARCH_SUFFIX)
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+DEPS =	 ./Makefile \
+		../test/test_util.h \
+		$(call rwildcard,../cub/,*.cuh)
+
+
+#-------------------------------------------------------------------------------
+# make default
+#-------------------------------------------------------------------------------
+
+default:
+
+
+#-------------------------------------------------------------------------------
+# make clean
+#-------------------------------------------------------------------------------
+
+clean :
+	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
+	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
+
+
+
+#-------------------------------------------------------------------------------
+# make tune_device_reduce
+#-------------------------------------------------------------------------------
+
+tune_device_reduce: bin/tune_device_reduce_$(SUFFIX)
+
+bin/tune_device_reduce_$(SUFFIX) : tune_device_reduce.cu $(DEPS)
+	mkdir -p bin
+	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/tune_device_reduce_$(SUFFIX) tune_device_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 -DTUNE_ARCH=$(SM_ARCH) -DTUNE_SIZE=$(TUNE_SIZE)
+
diff --git a/external/cub/tune/tune_device_reduce.cu b/external/cub/tune/tune_device_reduce.cu
new file mode 100644
index 00000000000..090e763ce29
--- /dev/null
+++ b/external/cub/tune/tune_device_reduce.cu
@@ -0,0 +1,763 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Evaluates different tuning configurations of DeviceReduce.
+ *
+ * The best way to use this program:
+ * (1) Find the best all-around single-block tune for a given arch.
+ *     For example, 100 samples [1 ..512], 100 timing iterations per config per sample:
+ *         ./bin/tune_device_reduce_4B_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --n=512 --single --device=0
+ * (2) Update the single tune in device_reduce.cuh
+ * (3) Find the best all-around multi-block tune for a given arch.
+ *     For example, 100 samples [single-block tile-size ..  50,331,648], 100 timing iterations per config per sample:
+ *         ./bin/tune_device_reduce_4B_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --device=0
+ * (4) Update the multi-block tune in device_reduce.cuh
+ *
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+#include <vector>
+#include <algorithm>
+#include <stdio.h>
+#include <cub/cub.cuh>
+#include "../test/test_util.h"
+
+using namespace cub;
+using namespace std;
+
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+#ifndef TUNE_ARCH
+#define TUNE_ARCH 100
+#endif
+
+int     g_max_items         = 48 * 1024 * 1024;
+int     g_samples           = 100;
+int     g_timing_iterations        = 2;
+bool    g_verbose           = false;
+bool    g_single            = false;
+bool    g_verify            = true;
+CachingDeviceAllocator  g_allocator;
+
+
+//---------------------------------------------------------------------
+// Host utility subroutines
+//---------------------------------------------------------------------
+
+/**
+ * Fill h_in[0 .. num_items) with generated values: one InitValue call
+ * per element, passing the element index as the third argument.
+ */
+template <typename T>
+void Initialize(GenMode gen_mode, T *h_in, int num_items)
+{
+    int idx = 0;
+    while (idx < num_items)
+    {
+        InitValue(gen_mode, h_in[idx], idx);
+        ++idx;
+    }
+}
+
+/**
+ * Sequential host reduction: left-fold of reduction_op starting at h_in[0]
+ */
+template <typename T, typename ReductionOp>
+T Reduce(T *h_in, ReductionOp reduction_op, int num_items)
+{
+    T accum = h_in[0];
+    int idx = 1;
+    while (idx < num_items)
+    {
+        accum = reduction_op(accum, h_in[idx]);
+        ++idx;
+    }
+    return accum;
+}
+
+
+
+//---------------------------------------------------------------------
+// Full tile test generation
+//---------------------------------------------------------------------
+
+
+
+/**
+ * Wrapper structure for generating and running different tuning configurations
+ */
+template <
+    typename T,
+    typename OffsetT,
+    typename ReductionOp>
+struct Schmoo
+{
+    //---------------------------------------------------------------------
+    // Types
+    //---------------------------------------------------------------------
+
+    /// Pairing of kernel function pointer and corresponding dispatch params, plus tuning statistics
+    template <typename KernelPtr>
+    struct DispatchTuple
+    {
+        KernelPtr                           kernel_ptr;
+        DeviceReduce::KernelDispachParams   params;
+
+        float                               avg_throughput;
+        float                               best_avg_throughput;
+        OffsetT                              best_size;
+        float                               hmean_speedup;
+
+        // Initializers are listed in member declaration order (the order they actually run in)
+        DispatchTuple() :
+            kernel_ptr(0),
+            params(DeviceReduce::KernelDispachParams()),
+            avg_throughput(0.0),
+            best_avg_throughput(0.0),
+            best_size(0),
+            hmean_speedup(0.0)
+        {}
+    };
+
+    /**
+     * Less-than ordering on hmean_speedup; speedups within 0.02 of each
+     * other are ordered by best_avg_throughput instead.
+     */
+    template <typename Tuple>
+    static bool MinSpeedup(const Tuple &a, const Tuple &b)
+    {
+        const float speedup_gap = a.hmean_speedup - b.hmean_speedup;
+        const bool  negligible  = (speedup_gap > -0.02) && (speedup_gap < 0.02);
+        if (negligible) return a.best_avg_throughput < b.best_avg_throughput;
+        return a.hmean_speedup < b.hmean_speedup;
+    }
+
+
+
+    /// Multi-block reduction kernel type and dispatch tuple type
+    typedef void (*MultiBlockDeviceReduceKernelPtr)(T*, T*, OffsetT, GridEvenShare<OffsetT>, GridQueue<OffsetT>, ReductionOp);
+    typedef DispatchTuple<MultiBlockDeviceReduceKernelPtr> MultiDispatchTuple;
+
+    /// Single-block reduction kernel type and dispatch tuple type
+    typedef void (*SingleBlockDeviceReduceKernelPtr)(T*, T*, OffsetT, ReductionOp);
+    typedef DispatchTuple<SingleBlockDeviceReduceKernelPtr> SingleDispatchTuple;
+
+
+    //---------------------------------------------------------------------
+    // Fields
+    //---------------------------------------------------------------------
+
+    vector<MultiDispatchTuple> multi_kernels;       // List of generated multi-block kernels
+    vector<SingleDispatchTuple> single_kernels;     // List of generated single-block kernels
+
+
+    //---------------------------------------------------------------------
+    // Kernel enumeration methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Policy filter: temp storage must be smaller than the arch's shared memory, and the
+     * vector load length must divide items per thread (ReductionOpT avoids shadowing ReductionOp)
+     */
+    template <typename TilesReducePolicy, typename ReductionOpT>
+    struct SmemSize
+    {
+        enum
+        {
+            BYTES = sizeof(typename BlockReduceTiles<TilesReducePolicy, T*, OffsetT, ReductionOpT>::TempStorage),
+            IS_OK = ((BYTES < ArchProps<TUNE_ARCH>::SMEM_BYTES) &&
+                     (TilesReducePolicy::ITEMS_PER_THREAD % TilesReducePolicy::VECTOR_LOAD_LENGTH == 0))
+        };
+    };
+
+
+    /**
+     * Primary template, used when IsOk is true: appends a dispatch tuple for
+     * the given TilesReducePolicy to the caller's kernel list.
+     */
+    template <
+        typename    TilesReducePolicy,
+        bool        IsOk = SmemSize<TilesReducePolicy, ReductionOp>::IS_OK>
+    struct Ok
+    {
+        /// Append the multi-block kernel for this policy at the given subscription factor
+        template <typename KernelsVector>
+        static void GenerateMulti(
+            KernelsVector &multi_kernels,
+            int subscription_factor)
+        {
+            MultiDispatchTuple candidate;
+            candidate.kernel_ptr = ReducePrivatizedKernel<TilesReducePolicy, T*, T*, OffsetT, ReductionOp>;
+            candidate.params.template Init<TilesReducePolicy>(subscription_factor);
+            multi_kernels.push_back(candidate);
+        }
+
+        /// Append the single-block kernel for this policy
+        template <typename KernelsVector>
+        static void GenerateSingle(KernelsVector &single_kernels)
+        {
+            SingleDispatchTuple candidate;
+            candidate.kernel_ptr = ReduceSingleKernel<TilesReducePolicy, T*, T*, OffsetT, ReductionOp>;
+            candidate.params.template Init<TilesReducePolicy>();
+            single_kernels.push_back(candidate);
+        }
+    };
+
+    /**
+     * Specialization selected when IsOk is false: both generators are no-ops, so the rejected TilesReducePolicy adds no kernels
+     */
+    template <typename TilesReducePolicy>
+    struct Ok<TilesReducePolicy, false>
+    {
+        template <typename KernelsVector>
+        static void GenerateMulti(KernelsVector &multi_kernels, int subscription_factor) {}
+
+        template <typename KernelsVector>
+        static void GenerateSingle(KernelsVector &single_kernels) {}
+    };
+
+
+    /// Enumerate block-scheduling variations (grid mapping and subscription factor) for one fully specified policy
+    template <
+        int                     BLOCK_THREADS,
+        int                     ITEMS_PER_THREAD,
+        int                     VECTOR_LOAD_LENGTH,
+        BlockReduceAlgorithm    BLOCK_ALGORITHM,
+        CacheLoadModifier      LOAD_MODIFIER>
+    void Enumerate()
+    {
+        // Multi-block kernels: GRID_MAPPING_RAKE at subscription factors 1, 2, 4 and 8
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 1);
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 2);
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 4);
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 8);
+#if TUNE_ARCH >= 200
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_DYNAMIC> >::GenerateMulti(multi_kernels, 1);
+#endif
+
+        // Single-block kernels (one per policy, GRID_MAPPING_RAKE)
+        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateSingle(single_kernels);
+    }
+
+
+    /// Enumerate load modifier variations (LOAD_DEFAULT always; LOAD_LDG only when TUNE_ARCH >= 350)
+    template <
+        int                     BLOCK_THREADS,
+        int                     ITEMS_PER_THREAD,
+        int                     VECTOR_LOAD_LENGTH,
+        BlockReduceAlgorithm    BLOCK_ALGORITHM>
+    void Enumerate()
+    {
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_DEFAULT>();
+#if TUNE_ARCH >= 350
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_LDG>();
+#endif
+    }
+
+
+    /// Enumerate block algorithms (BLOCK_REDUCE_RAKING and BLOCK_REDUCE_WARP_REDUCTIONS)
+    template <
+        int BLOCK_THREADS,
+        int ITEMS_PER_THREAD,
+        int VECTOR_LOAD_LENGTH>
+    void Enumerate()
+    {
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_REDUCE_RAKING>();
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_REDUCE_WARP_REDUCTIONS>();
+    }
+
+
+    /// Enumerate vectorization variations (vector load lengths 1, 2 and 4)
+    template <
+        int BLOCK_THREADS,
+        int ITEMS_PER_THREAD>
+    void Enumerate()
+    {
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 1>();
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 2>();
+        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 4>();
+    }
+
+
+    /// Enumerate thread-granularity variations (items per thread: each of 8, 12, 16, 20, 24 and its two neighbours)
+    template <int BLOCK_THREADS>
+    void Enumerate()
+    {
+        Enumerate<BLOCK_THREADS, 7>();
+        Enumerate<BLOCK_THREADS, 8>();
+        Enumerate<BLOCK_THREADS, 9>();
+
+        Enumerate<BLOCK_THREADS, 11>();
+        Enumerate<BLOCK_THREADS, 12>();
+        Enumerate<BLOCK_THREADS, 13>();
+
+        Enumerate<BLOCK_THREADS, 15>();
+        Enumerate<BLOCK_THREADS, 16>();
+        Enumerate<BLOCK_THREADS, 17>();
+
+        Enumerate<BLOCK_THREADS, 19>();
+        Enumerate<BLOCK_THREADS, 20>();
+        Enumerate<BLOCK_THREADS, 21>();
+
+        Enumerate<BLOCK_THREADS, 23>();
+        Enumerate<BLOCK_THREADS, 24>();
+        Enumerate<BLOCK_THREADS, 25>();
+    }
+
+
+    /// Enumerate block size variations (32 to 512 threads); entry point that populates both kernel lists
+    void Enumerate()
+    {
+        printf("\nEnumerating kernels\n"); fflush(stdout);
+
+        Enumerate<32>();
+        Enumerate<64>();
+        Enumerate<96>();
+        Enumerate<128>();
+        Enumerate<160>();
+        Enumerate<192>();
+        Enumerate<256>();
+        Enumerate<512>();
+    }
+
+
+    //---------------------------------------------------------------------
+    // Test methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Verify and time one multi-block/single-block kernel pairing at the given problem size
+     */
+    void TestConfiguration(
+        MultiDispatchTuple      &multi_dispatch,
+        SingleDispatchTuple     &single_dispatch,
+        T*                      d_in,
+        T*                      d_out,
+        T*                      h_reference,
+        OffsetT                  num_items,
+        ReductionOp             reduction_op)
+    {
+        // Clear output
+        if (g_verify) CubDebugExit(cudaMemset(d_out, 0, sizeof(T)));
+
+        // First Dispatch call runs with d_temp_storage == NULL; temp_storage_bytes then sizes the allocation
+        void            *d_temp_storage = NULL;
+        size_t          temp_storage_bytes = 0;
+        CubDebugExit(DeviceReduce::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            multi_dispatch.kernel_ptr,
+            single_dispatch.kernel_ptr,
+            FillAndResetDrainKernel<OffsetT>,
+            multi_dispatch.params,
+            single_dispatch.params,
+            d_in,
+            d_out,
+            num_items,
+            reduction_op));
+        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
+
+        // Warmup/correctness iteration
+        CubDebugExit(DeviceReduce::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            multi_dispatch.kernel_ptr,
+            single_dispatch.kernel_ptr,
+            FillAndResetDrainKernel<OffsetT>,
+            multi_dispatch.params,
+            single_dispatch.params,
+            d_in,
+            d_out,
+            num_items,
+            reduction_op));
+
+        if (g_verify) CubDebugExit(cudaDeviceSynchronize());
+
+        // Compare the single output item against the host reference (only when verifying)
+        int compare = (g_verify) ?
+            CompareDeviceResults(h_reference, d_out, 1, true, false) :
+            0;
+
+        // Performance: accumulate per-iteration elapsed milliseconds
+        GpuTimer gpu_timer;
+        float elapsed_millis = 0.0;
+        for (int i = 0; i < g_timing_iterations; i++)
+        {
+            gpu_timer.Start();
+
+            CubDebugExit(DeviceReduce::Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                multi_dispatch.kernel_ptr,
+                single_dispatch.kernel_ptr,
+                FillAndResetDrainKernel<OffsetT>,
+                multi_dispatch.params,
+                single_dispatch.params,
+                d_in,
+                d_out,
+                num_items,
+                reduction_op));
+
+            gpu_timer.Stop();
+            elapsed_millis += gpu_timer.ElapsedMillis();
+        }
+
+        // Wait for any outstanding device work to finish
+        CubDebugExit(cudaDeviceSynchronize());
+
+        float avg_elapsed = elapsed_millis / g_timing_iterations;
+        float avg_throughput = float(num_items) / avg_elapsed / 1000.0 / 1000.0;  // 10^9 items per second
+        float avg_bandwidth = avg_throughput * sizeof(T);                         // printed below as GB/s
+
+        multi_dispatch.avg_throughput = CUB_MAX(avg_throughput, multi_dispatch.avg_throughput);
+        if (avg_throughput > multi_dispatch.best_avg_throughput)
+        {
+            multi_dispatch.best_avg_throughput = avg_throughput;
+            multi_dispatch.best_size = num_items;
+        }
+
+        single_dispatch.avg_throughput = CUB_MAX(avg_throughput, single_dispatch.avg_throughput);
+        if (avg_throughput > single_dispatch.best_avg_throughput)
+        {
+            single_dispatch.best_avg_throughput = avg_throughput;
+            single_dispatch.best_size = num_items;
+        }
+
+        if (g_verbose)
+        {
+            printf("\t%.2f GB/s, multi_dispatch( ", avg_bandwidth);
+            multi_dispatch.params.Print();
+            printf(" ), single_dispatch( ");
+            single_dispatch.params.Print();
+            printf(" )\n");
+            fflush(stdout);
+        }
+
+        // Correctness is asserted only after the statistics above have been recorded and printed
+        AssertEquals(0, compare);
+
+        // Cleanup temporaries
+        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
+    }
+
+
+    /**
+     * Evaluate multi-block configurations: time every entry of multi_kernels
+     * over g_samples problem sizes (each paired with the tuned single-block
+     * policy), then print the entries ranked by harmonic-mean speedup.
+     */
+    void TestMulti(
+        T*                      h_in,
+        T*                      d_in,
+        T*                      d_out,
+        ReductionOp             reduction_op)
+    {
+        // Simple single kernel tuple for use with multi kernel sweep
+        typedef typename DeviceReduce::TunedPolicies<T, OffsetT, TUNE_ARCH>::SinglePolicy SimpleSinglePolicy;
+        SingleDispatchTuple simple_single_tuple;
+        simple_single_tuple.params.template Init<SimpleSinglePolicy>();
+        simple_single_tuple.kernel_ptr = ReduceSingleKernel<SimpleSinglePolicy, T*, T*, OffsetT, ReductionOp>;
+
+        double max_exponent      = log2(double(g_max_items));
+        double min_exponent      = log2(double(simple_single_tuple.params.tile_size));
+        unsigned int max_int     = (unsigned int) -1;
+
+        for (int sample = 0; sample < g_samples; ++sample)
+        {
+            printf("\nMulti-block sample %d, ", sample);
+
+            int num_items;
+            if (sample == 0)
+            {
+                // First sample: use max items
+                num_items = g_max_items;
+                printf("num_items: %d", num_items); fflush(stdout);
+            }
+            else
+            {
+                // Sample a problem size from [2^min_exponent, g_max_items].  The first half of the
+                // samples are log-distributed, the remaining half are uniformly-distributed.
+                unsigned int bits;
+                RandomBits(bits);
+                double scale = double(bits) / max_int;
+
+                if (sample < g_samples / 2)
+                {
+                    // log bias
+                    double exponent = ((max_exponent - min_exponent) * scale) + min_exponent;
+                    num_items = (int) pow(2.0, exponent);
+                    num_items = CUB_MIN(num_items, g_max_items);
+                    printf("num_items: %d (2^%.2f)", num_items, exponent); fflush(stdout);
+                }
+                else
+                {
+                    // uniform bias
+                    num_items = (int) CUB_MAX(pow(2.0, min_exponent), scale * g_max_items);
+                    num_items = CUB_MIN(num_items, g_max_items);
+                    printf("num_items: %d (%.2f * %d)", num_items, scale, g_max_items); fflush(stdout);
+                }
+            }
+            printf(g_verbose ? "\n" : ", ");
+
+            // Compute reference
+            T h_reference = Reduce(h_in, reduction_op, num_items);
+
+            // Run test on each multi-kernel configuration
+            float best_avg_throughput = 0.0;
+            for (int j = 0; j < multi_kernels.size(); ++j)
+            {
+                multi_kernels[j].avg_throughput = 0.0;
+
+                TestConfiguration(multi_kernels[j], simple_single_tuple, d_in, d_out, &h_reference, num_items, reduction_op);
+
+                best_avg_throughput = CUB_MAX(best_avg_throughput, multi_kernels[j].avg_throughput);
+            }
+
+            // Print best throughput for this problem size
+            printf("Best: %.2fe9 items/s (%.2f GB/s)\n", best_avg_throughput, best_avg_throughput * sizeof(T));
+
+            // Accumulate speedup (inverse for harmonic mean)
+            for (int j = 0; j < multi_kernels.size(); ++j)
+                multi_kernels[j].hmean_speedup += best_avg_throughput / multi_kernels[j].avg_throughput;
+        }
+
+        // Find max overall throughput and compute hmean speedups
+        float overall_max_throughput = 0.0;
+        for (int j = 0; j < multi_kernels.size(); ++j)
+        {
+            overall_max_throughput = CUB_MAX(overall_max_throughput, multi_kernels[j].best_avg_throughput);
+            multi_kernels[j].hmean_speedup = float(g_samples) / multi_kernels[j].hmean_speedup;
+        }
+
+        // Sort by cumulative speedup
+        sort(multi_kernels.begin(), multi_kernels.end(), MinSpeedup<MultiDispatchTuple>);
+
+        // Print ranked multi configurations (rank is cast because size() is not an int; ratio is scaled to percent)
+        printf("\nRanked multi_kernels:\n");
+        for (int j = 0; j < multi_kernels.size(); ++j)
+        {
+            printf("\t (%d) params( ", (int) (multi_kernels.size() - j));
+            multi_kernels[j].params.Print();
+            printf(" ) hmean speedup: %.3f, best throughput %.2f @ %d elements (%.2f GB/s, %.2f%%)\n",
+                multi_kernels[j].hmean_speedup,
+                multi_kernels[j].best_avg_throughput,
+                (int) multi_kernels[j].best_size,
+                multi_kernels[j].best_avg_throughput * sizeof(T),
+                100.0 * multi_kernels[j].best_avg_throughput / overall_max_throughput);
+        }
+
+        printf("\nMax multi-block throughput %.2f (%.2f GB/s)\n", overall_max_throughput, overall_max_throughput * sizeof(T));
+    }
+
+
+    /**
+     * Evaluate single-block configurations: time every entry of single_kernels
+     * over g_samples log-distributed problem sizes, then print the entries
+     * ranked by harmonic-mean speedup.
+     */
+    void TestSingle(
+        T*                      h_in,
+        T*                      d_in,
+        T*                      d_out,
+        ReductionOp             reduction_op)
+    {
+        // Construct a NULL-ptr multi-kernel tuple that forces a single-kernel pass
+        MultiDispatchTuple multi_tuple;
+
+        double max_exponent     = log2(double(g_max_items));
+        unsigned int max_int    = (unsigned int) -1;
+
+        for (int sample = 0; sample < g_samples; ++sample)
+        {
+            printf("\nSingle-block sample %d, ", sample);
+
+            int num_items;
+            if (sample == 0)
+            {
+                // First sample: use max items
+                num_items = g_max_items;
+                printf("num_items: %d", num_items); fflush(stdout);
+            }
+            else
+            {
+                // Sample a problem size from [2, g_max_items], log-distributed
+                unsigned int bits;
+                RandomBits(bits);
+                double scale = double(bits) / max_int;
+                double exponent = ((max_exponent - 1) * scale) + 1;
+                num_items = (int) pow(2.0, exponent);
+                printf("num_items: %d (2^%.2f)", num_items, exponent); fflush(stdout);
+            }
+
+            printf(g_verbose ? "\n" : ", ");
+
+            // Compute reference
+            T h_reference = Reduce(h_in, reduction_op, num_items);
+
+            // Run test on each single-kernel configuration; the default-constructed
+            // multi_tuple is passed only to fill the multi-kernel argument slot
+            float best_avg_throughput = 0.0;
+            for (int j = 0; j < single_kernels.size(); ++j)
+            {
+                single_kernels[j].avg_throughput = 0.0;
+
+                TestConfiguration(multi_tuple, single_kernels[j], d_in, d_out, &h_reference, num_items, reduction_op);
+
+                best_avg_throughput = CUB_MAX(best_avg_throughput, single_kernels[j].avg_throughput);
+            }
+
+            // Print best throughput for this problem size
+            printf("Best: %.2fe9 items/s (%.2f GB/s)\n", best_avg_throughput, best_avg_throughput * sizeof(T));
+
+            // Accumulate speedup (inverse for harmonic mean)
+            for (int j = 0; j < single_kernels.size(); ++j)
+                single_kernels[j].hmean_speedup += best_avg_throughput / single_kernels[j].avg_throughput;
+        }
+
+        // Find max overall throughput and compute hmean speedups
+        float overall_max_throughput = 0.0;
+        for (int j = 0; j < single_kernels.size(); ++j)
+        {
+            overall_max_throughput = CUB_MAX(overall_max_throughput, single_kernels[j].best_avg_throughput);
+            single_kernels[j].hmean_speedup = float(g_samples) / single_kernels[j].hmean_speedup;
+        }
+
+        // Sort by cumulative speedup
+        sort(single_kernels.begin(), single_kernels.end(), MinSpeedup<SingleDispatchTuple>);
+
+        // Print ranked single configurations (rank is cast because size() is not an int; ratio is scaled to percent)
+        printf("\nRanked single_kernels:\n");
+        for (int j = 0; j < single_kernels.size(); ++j)
+        {
+            printf("\t (%d) params( ", (int) (single_kernels.size() - j));
+            single_kernels[j].params.Print();
+            printf(" ) hmean speedup: %.3f, best throughput %.2f @ %d elements (%.2f GB/s, %.2f%%)\n",
+                single_kernels[j].hmean_speedup,
+                single_kernels[j].best_avg_throughput,
+                (int) single_kernels[j].best_size,
+                single_kernels[j].best_avg_throughput * sizeof(T),
+                100.0 * single_kernels[j].best_avg_throughput / overall_max_throughput);
+        }
+
+        printf("\nMax single-block throughput %.2f (%.2f GB/s)\n", overall_max_throughput, overall_max_throughput * sizeof(T));
+    }
+
+};
+
+
+
+//---------------------------------------------------------------------
+// Main
+//---------------------------------------------------------------------
+
+/**
+ * Main: parse options, enumerate the tuning configurations, then run the single- or multi-block sweep
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    args.GetCmdLineArgument("n", g_max_items);
+    args.GetCmdLineArgument("s", g_samples);
+    args.GetCmdLineArgument("i", g_timing_iterations);
+    g_verbose = args.CheckCmdLineFlag("v");
+    g_single = args.CheckCmdLineFlag("single");
+    g_verify = !args.CheckCmdLineFlag("noverify");
+
+    // Print usage (each option carries its own trailing space so they do not run together)
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--n=<max items>] "
+            "[--s=<samples>] "
+            "[--i=<timing iterations>] "
+            "[--single] "
+            "[--v] "
+            "[--noverify]"
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+#if (TUNE_SIZE == 1)
+    typedef unsigned char T;
+#elif (TUNE_SIZE == 2)
+    typedef unsigned short T;
+#elif (TUNE_SIZE == 4)
+    typedef unsigned int T;
+#elif (TUNE_SIZE == 8)
+    typedef unsigned long long T;
+#else
+    // Default
+    typedef unsigned int T;
+#endif
+
+    typedef unsigned int OffsetT;
+    Sum reduction_op;
+
+    // Enumerate kernels
+    Schmoo<T, OffsetT, Sum > schmoo;
+    schmoo.Enumerate();
+
+    // Allocate host arrays
+    T *h_in = new T[g_max_items];
+
+    // Initialize problem
+    Initialize(UNIFORM, h_in, g_max_items);
+
+    // Initialize device arrays
+    T *d_in = NULL;
+    T *d_out = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * g_max_items));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
+    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * g_max_items, cudaMemcpyHostToDevice));
+
+    // Test kernels
+    if (g_single)
+        schmoo.TestSingle(h_in, d_in, d_out, reduction_op);
+    else
+        schmoo.TestMulti(h_in, d_in, d_out, reduction_op);
+
+    // Cleanup (delete[] accepts a null pointer, so no guard is needed for h_in)
+    delete[] h_in;
+    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
+    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
+
+    return 0;
+}
+
+
+
diff --git a/include/functions.h b/include/functions.h
index d5fb09604ff..a1b15495af3 100644
--- a/include/functions.h
+++ b/include/functions.h
@@ -60,3 +60,5 @@ gdf_error gdf_delete_transpose(gdf_graph *graph);
 gdf_error gdf_pagerank(gdf_graph *graph, gdf_column *pagerank, float alpha, float tolerance, int max_iter, bool has_guess);
 
 gdf_error gdf_grmat_gen (const char* argv, size_t &vertices, size_t &edges, gdf_column* src, gdf_column* dest, gdf_column* val);
+
+gdf_error gdf_bfs(gdf_graph *graph, gdf_column *distances, gdf_column *predecessors, int start_node, bool directed);
diff --git a/python/bfs/bfs_wrapper.cpp b/python/bfs/bfs_wrapper.cpp
new file mode 100644
index 00000000000..76e6447d377
--- /dev/null
+++ b/python/bfs/bfs_wrapper.cpp
@@ -0,0 +1,6885 @@
+/* Generated by Cython 0.28.5 */
+
+/* BEGIN: Cython Metadata
+{
+    "distutils": {
+        "depends": [],
+        "extra_compile_args": [
+            "-std=c++11"
+        ],
+        "include_dirs": [
+            "/home/jwyles/anaconda3/envs/cugraph_dev/lib/python3.5/site-packages/numpy/core/include",
+            "/home/jwyles/anaconda3/envs/cugraph_dev/include",
+            "src",
+            "include",
+            "../gunrock",
+            "../gunrock/externals/moderngpu/include",
+            "../gunrock/externals/cub"
+        ],
+        "language": "c++",
+        "libraries": [
+            "cugraph",
+            "cudf"
+        ],
+        "library_dirs": [
+            "/home/jwyles/anaconda3/envs/cugraph_dev/lib/python3.5/site-packages"
+        ],
+        "name": "cugraph",
+        "sources": [
+            "python/pagerank/pagerank_wrapper.pyx",
+            "python/bfs/bfs_wrapper.pyx"
+        ]
+    },
+    "module_name": "cugraph"
+}
+END: Cython Metadata */
+
+#define PY_SSIZE_T_CLEAN
+#include "Python.h"
+#ifndef Py_PYTHON_H
+    #error Python headers needed to compile C extensions, please install development version of Python.
+#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000)
+    #error Cython requires Python 2.6+ or Python 3.3+.
+#else
+#define CYTHON_ABI "0_28_5"
+#define CYTHON_FUTURE_DIVISION 0
+#include <stddef.h>
+#ifndef offsetof
+  #define offsetof(type, member) ( (size_t) & ((type*)0) -> member )
+#endif
+#if !defined(WIN32) && !defined(MS_WINDOWS)
+  #ifndef __stdcall
+    #define __stdcall
+  #endif
+  #ifndef __cdecl
+    #define __cdecl
+  #endif
+  #ifndef __fastcall
+    #define __fastcall
+  #endif
+#endif
+#ifndef DL_IMPORT
+  #define DL_IMPORT(t) t
+#endif
+#ifndef DL_EXPORT
+  #define DL_EXPORT(t) t
+#endif
+#define __PYX_COMMA ,
+#ifndef HAVE_LONG_LONG
+  #if PY_VERSION_HEX >= 0x02070000
+    #define HAVE_LONG_LONG
+  #endif
+#endif
+#ifndef PY_LONG_LONG
+  #define PY_LONG_LONG LONG_LONG
+#endif
+#ifndef Py_HUGE_VAL
+  #define Py_HUGE_VAL HUGE_VAL
+#endif
+#ifdef PYPY_VERSION
+  #define CYTHON_COMPILING_IN_PYPY 1
+  #define CYTHON_COMPILING_IN_PYSTON 0
+  #define CYTHON_COMPILING_IN_CPYTHON 0
+  #undef CYTHON_USE_TYPE_SLOTS
+  #define CYTHON_USE_TYPE_SLOTS 0
+  #undef CYTHON_USE_PYTYPE_LOOKUP
+  #define CYTHON_USE_PYTYPE_LOOKUP 0
+  #if PY_VERSION_HEX < 0x03050000
+    #undef CYTHON_USE_ASYNC_SLOTS
+    #define CYTHON_USE_ASYNC_SLOTS 0
+  #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+    #define CYTHON_USE_ASYNC_SLOTS 1
+  #endif
+  #undef CYTHON_USE_PYLIST_INTERNALS
+  #define CYTHON_USE_PYLIST_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_INTERNALS
+  #define CYTHON_USE_UNICODE_INTERNALS 0
+  #undef CYTHON_USE_UNICODE_WRITER
+  #define CYTHON_USE_UNICODE_WRITER 0
+  #undef CYTHON_USE_PYLONG_INTERNALS
+  #define CYTHON_USE_PYLONG_INTERNALS 0
+  #undef CYTHON_AVOID_BORROWED_REFS
+  #define CYTHON_AVOID_BORROWED_REFS 1
+  #undef CYTHON_ASSUME_SAFE_MACROS
+  #define CYTHON_ASSUME_SAFE_MACROS 0
+  #undef CYTHON_UNPACK_METHODS
+  #define CYTHON_UNPACK_METHODS 0
+  #undef CYTHON_FAST_THREAD_STATE
+  #define CYTHON_FAST_THREAD_STATE 0
+  #undef CYTHON_FAST_PYCALL
+  #define CYTHON_FAST_PYCALL 0
+  #undef CYTHON_PEP489_MULTI_PHASE_INIT
+  #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+  #undef CYTHON_USE_TP_FINALIZE
+  #define CYTHON_USE_TP_FINALIZE 0
+#elif defined(PYSTON_VERSION)
+  #define CYTHON_COMPILING_IN_PYPY 0
+  #define CYTHON_COMPILING_IN_PYSTON 1
+  #define CYTHON_COMPILING_IN_CPYTHON 0
+  #ifndef CYTHON_USE_TYPE_SLOTS
+    #define CYTHON_USE_TYPE_SLOTS 1
+  #endif
+  #undef CYTHON_USE_PYTYPE_LOOKUP
+  #define CYTHON_USE_PYTYPE_LOOKUP 0
+  #undef CYTHON_USE_ASYNC_SLOTS
+  #define CYTHON_USE_ASYNC_SLOTS 0
+  #undef CYTHON_USE_PYLIST_INTERNALS
+  #define CYTHON_USE_PYLIST_INTERNALS 0
+  #ifndef CYTHON_USE_UNICODE_INTERNALS
+    #define CYTHON_USE_UNICODE_INTERNALS 1
+  #endif
+  #undef CYTHON_USE_UNICODE_WRITER
+  #define CYTHON_USE_UNICODE_WRITER 0
+  #undef CYTHON_USE_PYLONG_INTERNALS
+  #define CYTHON_USE_PYLONG_INTERNALS 0
+  #ifndef CYTHON_AVOID_BORROWED_REFS
+    #define CYTHON_AVOID_BORROWED_REFS 0
+  #endif
+  #ifndef CYTHON_ASSUME_SAFE_MACROS
+    #define CYTHON_ASSUME_SAFE_MACROS 1
+  #endif
+  #ifndef CYTHON_UNPACK_METHODS
+    #define CYTHON_UNPACK_METHODS 1
+  #endif
+  #undef CYTHON_FAST_THREAD_STATE
+  #define CYTHON_FAST_THREAD_STATE 0
+  #undef CYTHON_FAST_PYCALL
+  #define CYTHON_FAST_PYCALL 0
+  #undef CYTHON_PEP489_MULTI_PHASE_INIT
+  #define CYTHON_PEP489_MULTI_PHASE_INIT 0
+  #undef CYTHON_USE_TP_FINALIZE
+  #define CYTHON_USE_TP_FINALIZE 0
+#else
+  #define CYTHON_COMPILING_IN_PYPY 0
+  #define CYTHON_COMPILING_IN_PYSTON 0
+  #define CYTHON_COMPILING_IN_CPYTHON 1
+  #ifndef CYTHON_USE_TYPE_SLOTS
+    #define CYTHON_USE_TYPE_SLOTS 1
+  #endif
+  #if PY_VERSION_HEX < 0x02070000
+    #undef CYTHON_USE_PYTYPE_LOOKUP
+    #define CYTHON_USE_PYTYPE_LOOKUP 0
+  #elif !defined(CYTHON_USE_PYTYPE_LOOKUP)
+    #define CYTHON_USE_PYTYPE_LOOKUP 1
+  #endif
+  #if PY_MAJOR_VERSION < 3
+    #undef CYTHON_USE_ASYNC_SLOTS
+    #define CYTHON_USE_ASYNC_SLOTS 0
+  #elif !defined(CYTHON_USE_ASYNC_SLOTS)
+    #define CYTHON_USE_ASYNC_SLOTS 1
+  #endif
+  #if PY_VERSION_HEX < 0x02070000
+    #undef CYTHON_USE_PYLONG_INTERNALS
+    #define CYTHON_USE_PYLONG_INTERNALS 0
+  #elif !defined(CYTHON_USE_PYLONG_INTERNALS)
+    #define CYTHON_USE_PYLONG_INTERNALS 1
+  #endif
+  #ifndef CYTHON_USE_PYLIST_INTERNALS
+    #define CYTHON_USE_PYLIST_INTERNALS 1
+  #endif
+  #ifndef CYTHON_USE_UNICODE_INTERNALS
+    #define CYTHON_USE_UNICODE_INTERNALS 1
+  #endif
+  #if PY_VERSION_HEX < 0x030300F0
+    #undef CYTHON_USE_UNICODE_WRITER
+    #define CYTHON_USE_UNICODE_WRITER 0
+  #elif !defined(CYTHON_USE_UNICODE_WRITER)
+    #define CYTHON_USE_UNICODE_WRITER 1
+  #endif
+  #ifndef CYTHON_AVOID_BORROWED_REFS
+    #define CYTHON_AVOID_BORROWED_REFS 0
+  #endif
+  #ifndef CYTHON_ASSUME_SAFE_MACROS
+    #define CYTHON_ASSUME_SAFE_MACROS 1
+  #endif
+  #ifndef CYTHON_UNPACK_METHODS
+    #define CYTHON_UNPACK_METHODS 1
+  #endif
+  #ifndef CYTHON_FAST_THREAD_STATE
+    #define CYTHON_FAST_THREAD_STATE 1
+  #endif
+  #ifndef CYTHON_FAST_PYCALL
+    #define CYTHON_FAST_PYCALL 1
+  #endif
+  #ifndef CYTHON_PEP489_MULTI_PHASE_INIT
+    #define CYTHON_PEP489_MULTI_PHASE_INIT (0 && PY_VERSION_HEX >= 0x03050000)
+  #endif
+  #ifndef CYTHON_USE_TP_FINALIZE
+    #define CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1)
+  #endif
+#endif
+#if !defined(CYTHON_FAST_PYCCALL)
+#define CYTHON_FAST_PYCCALL  (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1)
+#endif
+#if CYTHON_USE_PYLONG_INTERNALS
+  #include "longintrepr.h"
+  #undef SHIFT
+  #undef BASE
+  #undef MASK
+#endif
+#ifndef __has_attribute
+  #define __has_attribute(x) 0
+#endif
+#ifndef __has_cpp_attribute
+  #define __has_cpp_attribute(x) 0
+#endif
+#ifndef CYTHON_RESTRICT
+  #if defined(__GNUC__)
+    #define CYTHON_RESTRICT __restrict__
+  #elif defined(_MSC_VER) && _MSC_VER >= 1400
+    #define CYTHON_RESTRICT __restrict
+  #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    #define CYTHON_RESTRICT restrict
+  #else
+    #define CYTHON_RESTRICT
+  #endif
+#endif
+#ifndef CYTHON_UNUSED
+# if defined(__GNUC__)
+#   if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#     define CYTHON_UNUSED __attribute__ ((__unused__))
+#   else
+#     define CYTHON_UNUSED
+#   endif
+# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER))
+#   define CYTHON_UNUSED __attribute__ ((__unused__))
+# else
+#   define CYTHON_UNUSED
+# endif
+#endif
+#ifndef CYTHON_MAYBE_UNUSED_VAR
+#  if defined(__cplusplus)
+     template<class T> void CYTHON_MAYBE_UNUSED_VAR( const T& ) { }
+#  else
+#    define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x)
+#  endif
+#endif
+#ifndef CYTHON_NCP_UNUSED
+# if CYTHON_COMPILING_IN_CPYTHON
+#  define CYTHON_NCP_UNUSED
+# else
+#  define CYTHON_NCP_UNUSED CYTHON_UNUSED
+# endif
+#endif
+#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None)
+#ifdef _MSC_VER
+    #ifndef _MSC_STDINT_H_
+        #if _MSC_VER < 1300
+           typedef unsigned char     uint8_t;
+           typedef unsigned int      uint32_t;
+        #else
+           typedef unsigned __int8   uint8_t;
+           typedef unsigned __int32  uint32_t;
+        #endif
+    #endif
+#else
+   #include <stdint.h>
+#endif
+#ifndef CYTHON_FALLTHROUGH
+  #if defined(__cplusplus) && __cplusplus >= 201103L
+    #if __has_cpp_attribute(fallthrough)
+      #define CYTHON_FALLTHROUGH [[fallthrough]]
+    #elif __has_cpp_attribute(clang::fallthrough)
+      #define CYTHON_FALLTHROUGH [[clang::fallthrough]]
+    #elif __has_cpp_attribute(gnu::fallthrough)
+      #define CYTHON_FALLTHROUGH [[gnu::fallthrough]]
+    #endif
+  #endif
+  #ifndef CYTHON_FALLTHROUGH
+    #if __has_attribute(fallthrough)
+      #define CYTHON_FALLTHROUGH __attribute__((fallthrough))
+    #else
+      #define CYTHON_FALLTHROUGH
+    #endif
+  #endif
+  #if defined(__clang__ ) && defined(__apple_build_version__)
+    #if __apple_build_version__ < 7000000
+      #undef  CYTHON_FALLTHROUGH
+      #define CYTHON_FALLTHROUGH
+    #endif
+  #endif
+#endif
+
+#ifndef __cplusplus
+  #error "Cython files generated with the C++ option must be compiled with a C++ compiler."
+#endif
+#ifndef CYTHON_INLINE
+  #if defined(__clang__)
+    #define CYTHON_INLINE __inline__ __attribute__ ((__unused__))
+  #else
+    #define CYTHON_INLINE inline
+  #endif
+#endif
+template<typename T>
+void __Pyx_call_destructor(T& x) {  // explicitly runs T's destructor on x; nothing here releases x's storage
+    x.~T();
+}
+template<typename T>
+class __Pyx_FakeReference {  // pointer-backed wrapper that can be used where a T& is expected
+  public:
+    __Pyx_FakeReference() : ptr(NULL) { }  // starts empty; the accessors below do not check for NULL
+    __Pyx_FakeReference(const T& ref) : ptr(const_cast<T*>(&ref)) { }  // stores the address of ref, casting away const
+    T *operator->() { return ptr; }  // member access goes to the wrapped object
+    T *operator&() { return ptr; }  // address-of yields the wrapped object's address, not the wrapper's
+    operator T&() { return *ptr; }  // implicit conversion back to a real reference
+    template<typename U> bool operator ==(U other) { return *ptr == other; }  // comparisons are forwarded to the wrapped object
+    template<typename U> bool operator !=(U other) { return *ptr != other; }
+  private:
+    T *ptr;  // never deleted by this class
+};
+
+#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag)
+  #define Py_OptimizeFlag 0
+#endif
+#define __PYX_BUILD_PY_SSIZE_T "n"
+#define CYTHON_FORMAT_SSIZE_T "z"
+#if PY_MAJOR_VERSION < 3
+  #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"
+  #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+          PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+  #define __Pyx_DefaultClassType PyClass_Type
+#else
+  #define __Pyx_BUILTIN_MODULE_NAME "builtins"
+  #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
+          PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+  #define __Pyx_DefaultClassType PyType_Type
+#endif
+#ifndef Py_TPFLAGS_CHECKTYPES
+  #define Py_TPFLAGS_CHECKTYPES 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_INDEX
+  #define Py_TPFLAGS_HAVE_INDEX 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_NEWBUFFER
+  #define Py_TPFLAGS_HAVE_NEWBUFFER 0
+#endif
+#ifndef Py_TPFLAGS_HAVE_FINALIZE
+  #define Py_TPFLAGS_HAVE_FINALIZE 0
+#endif
+#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL)
+  #ifndef METH_FASTCALL
+     #define METH_FASTCALL 0x80
+  #endif
+  typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs);
+  typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args,
+                                                          Py_ssize_t nargs, PyObject *kwnames);
+#else
+  #define __Pyx_PyCFunctionFast _PyCFunctionFast
+  #define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords
+#endif
+#if CYTHON_FAST_PYCCALL
+#define __Pyx_PyFastCFunction_Check(func)\
+    ((PyCFunction_Check(func) && (METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS)))))
+#else
+#define __Pyx_PyFastCFunction_Check(func) 0
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc)
+  #define PyObject_Malloc(s)   PyMem_Malloc(s)
+  #define PyObject_Free(p)     PyMem_Free(p)
+  #define PyObject_Realloc(p)  PyMem_Realloc(p)
+#endif
+#if CYTHON_COMPILING_IN_PYSTON
+  #define __Pyx_PyCode_HasFreeVars(co)  PyCode_HasFreeVars(co)
+  #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno)
+#else
+  #define __Pyx_PyCode_HasFreeVars(co)  (PyCode_GetNumFree(co) > 0)
+  #define __Pyx_PyFrame_SetLineNumber(frame, lineno)  (frame)->f_lineno = (lineno)
+#endif
+#if !CYTHON_FAST_THREAD_STATE || PY_VERSION_HEX < 0x02070000
+  #define __Pyx_PyThreadState_Current PyThreadState_GET()
+#elif PY_VERSION_HEX >= 0x03060000
+  #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet()
+#elif PY_VERSION_HEX >= 0x03000000
+  #define __Pyx_PyThreadState_Current PyThreadState_GET()
+#else
+  #define __Pyx_PyThreadState_Current _PyThreadState_Current
+#endif
+#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT)  // TSS names not provided: emulate them on top of the PyThread_*_key calls
+#include "pythread.h"
+#define Py_tss_NEEDS_INIT 0
+typedef int Py_tss_t;
+static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) {
+  *key = PyThread_create_key();
+  return 0; // PyThread_create_key reports success always
+}
+static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) {
+  Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t));
+  if (key) *key = Py_tss_NEEDS_INIT; // PyObject_Malloc may return NULL; pass that NULL back instead of writing through it
+  return key;
+}
+static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) {
+  PyObject_Free(key);
+}
+static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) {
+  return *key != Py_tss_NEEDS_INIT;
+}
+static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) {
+  PyThread_delete_key(*key);
+  *key = Py_tss_NEEDS_INIT; // mark the slot as not created again
+}
+static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) {
+  return PyThread_set_key_value(*key, value);
+}
+static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) {
+  return PyThread_get_key_value(*key);
+}
+#endif // TSS (Thread Specific Storage) API
+#if CYTHON_COMPILING_IN_CPYTHON || defined(_PyDict_NewPresized)
+#define __Pyx_PyDict_NewPresized(n)  ((n <= 8) ? PyDict_New() : _PyDict_NewPresized(n))
+#else
+#define __Pyx_PyDict_NewPresized(n)  PyDict_New()
+#endif
+#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION
+  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_TrueDivide(x,y)
+  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceTrueDivide(x,y)
+#else
+  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_Divide(x,y)
+  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceDivide(x,y)
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && CYTHON_USE_UNICODE_INTERNALS
+#define __Pyx_PyDict_GetItemStr(dict, name)  _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash)
+#else
+#define __Pyx_PyDict_GetItemStr(dict, name)  PyDict_GetItem(dict, name)
+#endif
+#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND)
+  #define CYTHON_PEP393_ENABLED 1
+  #define __Pyx_PyUnicode_READY(op)       (likely(PyUnicode_IS_READY(op)) ?\
+                                              0 : _PyUnicode_Ready((PyObject *)(op)))
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_LENGTH(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i)
+  #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u)   PyUnicode_MAX_CHAR_VALUE(u)
+  #define __Pyx_PyUnicode_KIND(u)         PyUnicode_KIND(u)
+  #define __Pyx_PyUnicode_DATA(u)         PyUnicode_DATA(u)
+  #define __Pyx_PyUnicode_READ(k, d, i)   PyUnicode_READ(k, d, i)
+  #define __Pyx_PyUnicode_WRITE(k, d, i, ch)  PyUnicode_WRITE(k, d, i, ch)
+  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
+#else
+  #define CYTHON_PEP393_ENABLED 0
+  #define PyUnicode_1BYTE_KIND  1
+  #define PyUnicode_2BYTE_KIND  2
+  #define PyUnicode_4BYTE_KIND  4
+  #define __Pyx_PyUnicode_READY(op)       (0)
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_SIZE(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i]))
+  #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u)   ((sizeof(Py_UNICODE) == 2) ? 65535 : 1114111)
+  #define __Pyx_PyUnicode_KIND(u)         (sizeof(Py_UNICODE))
+  #define __Pyx_PyUnicode_DATA(u)         ((void*)PyUnicode_AS_UNICODE(u))
+  #define __Pyx_PyUnicode_READ(k, d, i)   ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i]))
+  #define __Pyx_PyUnicode_WRITE(k, d, i, ch)  (((void)(k)), ((Py_UNICODE*)d)[i] = ch)
+  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != PyUnicode_GET_SIZE(u))
+#endif
+#if CYTHON_COMPILING_IN_PYPY
+  #define __Pyx_PyUnicode_Concat(a, b)      PyNumber_Add(a, b)
+  #define __Pyx_PyUnicode_ConcatSafe(a, b)  PyNumber_Add(a, b)
+#else
+  #define __Pyx_PyUnicode_Concat(a, b)      PyUnicode_Concat(a, b)
+  #define __Pyx_PyUnicode_ConcatSafe(a, b)  ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\
+      PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b))
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains)
+  #define PyUnicode_Contains(u, s)  PySequence_Contains(u, s)
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check)
+  #define PyByteArray_Check(obj)  PyObject_TypeCheck(obj, &PyByteArray_Type)
+#endif
+#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format)
+  #define PyObject_Format(obj, fmt)  PyObject_CallMethod(obj, "__format__", "O", fmt)
+#endif
+#define __Pyx_PyString_FormatSafe(a, b)   ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b))
+#define __Pyx_PyUnicode_FormatSafe(a, b)  ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b))
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyString_Format(a, b)  PyUnicode_Format(a, b)
+#else
+  #define __Pyx_PyString_Format(a, b)  PyString_Format(a, b)
+#endif
+#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII)
+  #define PyObject_ASCII(o)            PyObject_Repr(o)
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define PyBaseString_Type            PyUnicode_Type
+  #define PyStringObject               PyUnicodeObject
+  #define PyString_Type                PyUnicode_Type
+  #define PyString_Check               PyUnicode_Check
+  #define PyString_CheckExact          PyUnicode_CheckExact
+  #define PyObject_Unicode             PyObject_Str
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj)
+  #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj)
+#else
+  #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj))
+  #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj))
+#endif
+#ifndef PySet_CheckExact
+  #define PySet_CheckExact(obj)        (Py_TYPE(obj) == &PySet_Type)
+#endif
+#if CYTHON_ASSUME_SAFE_MACROS
+  #define __Pyx_PySequence_SIZE(seq)  Py_SIZE(seq)
+#else
+  #define __Pyx_PySequence_SIZE(seq)  PySequence_Size(seq)
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define PyIntObject                  PyLongObject
+  #define PyInt_Type                   PyLong_Type
+  #define PyInt_Check(op)              PyLong_Check(op)
+  #define PyInt_CheckExact(op)         PyLong_CheckExact(op)
+  #define PyInt_FromString             PyLong_FromString
+  #define PyInt_FromUnicode            PyLong_FromUnicode
+  #define PyInt_FromLong               PyLong_FromLong
+  #define PyInt_FromSize_t             PyLong_FromSize_t
+  #define PyInt_FromSsize_t            PyLong_FromSsize_t
+  #define PyInt_AsLong                 PyLong_AsLong
+  #define PyInt_AS_LONG                PyLong_AS_LONG
+  #define PyInt_AsSsize_t              PyLong_AsSsize_t
+  #define PyInt_AsUnsignedLongMask     PyLong_AsUnsignedLongMask
+  #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask
+  #define PyNumber_Int                 PyNumber_Long
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define PyBoolObject                 PyLongObject
+#endif
+#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY
+  #ifndef PyUnicode_InternFromString
+    #define PyUnicode_InternFromString(s) PyUnicode_FromString(s)
+  #endif
+#endif
+#if PY_VERSION_HEX < 0x030200A4
+  typedef long Py_hash_t;
+  #define __Pyx_PyInt_FromHash_t PyInt_FromLong
+  #define __Pyx_PyInt_AsHash_t   PyInt_AsLong
+#else
+  #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t
+  #define __Pyx_PyInt_AsHash_t   PyInt_AsSsize_t
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyMethod_New(func, self, klass) ((self) ? PyMethod_New(func, self) : (Py_INCREF(func), func))
+#else
+  #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass)
+#endif
+#if CYTHON_USE_ASYNC_SLOTS
+  #if PY_VERSION_HEX >= 0x030500B1
+    #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods
+    #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async)
+  #else
+    #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved))
+  #endif
+#else
+  #define __Pyx_PyType_AsAsync(obj) NULL
+#endif
+#ifndef __Pyx_PyAsyncMethodsStruct
+    typedef struct { /* Fallback layout, used only when __Pyx_PyAsyncMethodsStruct is not already a macro (see the enclosing #ifndef). */
+        unaryfunc am_await; /* one-argument slot; presumably backs __await__ -- confirm against CPython's PyAsyncMethods */
+        unaryfunc am_aiter; /* one-argument slot; presumably backs __aiter__ */
+        unaryfunc am_anext; /* one-argument slot; presumably backs __anext__ */
+    } __Pyx_PyAsyncMethodsStruct;
+#endif
+
+#if defined(WIN32) || defined(MS_WINDOWS)
+  #define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+#ifdef NAN
+#define __PYX_NAN() ((float) NAN)
+#else
+static CYTHON_INLINE float __PYX_NAN() { /* Fallback NaN generator for toolchains whose <math.h> does not define NAN. */
+  float all_ones; /* filled byte-wise below; an all-ones pattern is a NaN assuming IEEE-754 floats */
+  memset(&all_ones, 0xFF, sizeof(float));
+  return all_ones;
+}
+#endif
+#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL)
+#define __Pyx_truncl trunc
+#else
+#define __Pyx_truncl truncl
+#endif
+
+
+#define __PYX_ERR(f_index, lineno, Ln_error) \
+{ \
+  __pyx_filename = __pyx_f[f_index]; __pyx_lineno = lineno; __pyx_clineno = __LINE__; goto Ln_error; \
+}
+
+#ifndef __PYX_EXTERN_C
+  #ifdef __cplusplus
+    #define __PYX_EXTERN_C extern "C"
+  #else
+    #define __PYX_EXTERN_C extern
+  #endif
+#endif
+
+#define __PYX_HAVE__bfs_wrapper
+#define __PYX_HAVE_API__bfs_wrapper
+/* Early includes */
+#include "cudf.h"
+#include "cugraph.h"
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif /* _OPENMP */
+
+#if defined(PYREX_WITHOUT_ASSERTIONS) && !defined(CYTHON_WITHOUT_ASSERTIONS)
+#define CYTHON_WITHOUT_ASSERTIONS
+#endif
+
+typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding; /* One string-table row consumed by __Pyx_InitStrings(); presumably p = destination slot, s/n = C data and its length, encoding = codec name -- confirm in __Pyx_InitStrings */
+                const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; /* trailing char fields look like boolean flags selecting the object kind and interning -- TODO confirm */
+
+#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0
+#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT 0
+#define __PYX_DEFAULT_STRING_ENCODING ""
+#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString
+#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#define __Pyx_uchar_cast(c) ((unsigned char)c)
+#define __Pyx_long_cast(x) ((long)x)
+#define __Pyx_fits_Py_ssize_t(v, type, is_signed)  (\
+    (sizeof(type) < sizeof(Py_ssize_t))  ||\
+    (sizeof(type) > sizeof(Py_ssize_t) &&\
+          likely(v < (type)PY_SSIZE_T_MAX ||\
+                 v == (type)PY_SSIZE_T_MAX)  &&\
+          (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\
+                                v == (type)PY_SSIZE_T_MIN)))  ||\
+    (sizeof(type) == sizeof(Py_ssize_t) &&\
+          (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\
+                               v == (type)PY_SSIZE_T_MAX)))  )
+#if defined (__cplusplus) && __cplusplus >= 201103L
+    #include <cstdlib>
+    #define __Pyx_sst_abs(value) std::abs(value)
+#elif SIZEOF_INT >= SIZEOF_SIZE_T
+    #define __Pyx_sst_abs(value) abs(value)
+#elif SIZEOF_LONG >= SIZEOF_SIZE_T
+    #define __Pyx_sst_abs(value) labs(value)
+#elif defined (_MSC_VER)
+    #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value))
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    #define __Pyx_sst_abs(value) llabs(value)
+#elif defined (__GNUC__)
+    #define __Pyx_sst_abs(value) __builtin_llabs(value)
+#else
+    #define __Pyx_sst_abs(value) ((value<0) ? -value : value)
+#endif
+static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*);
+static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length);
+#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s))
+#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l)
+#define __Pyx_PyBytes_FromString        PyBytes_FromString
+#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*);
+#if PY_MAJOR_VERSION < 3
+    #define __Pyx_PyStr_FromString        __Pyx_PyBytes_FromString
+    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#else
+    #define __Pyx_PyStr_FromString        __Pyx_PyUnicode_FromString
+    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize
+#endif
+#define __Pyx_PyBytes_AsWritableString(s)     ((char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsWritableSString(s)    ((signed char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsWritableUString(s)    ((unsigned char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsString(s)     ((const char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsSString(s)    ((const signed char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyBytes_AsUString(s)    ((const unsigned char*) PyBytes_AS_STRING(s))
+#define __Pyx_PyObject_AsWritableString(s)    ((char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsWritableSString(s)    ((signed char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsWritableUString(s)    ((unsigned char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsSString(s)    ((const signed char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsUString(s)    ((const unsigned char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_FromCString(s)  __Pyx_PyObject_FromString((const char*)s)
+#define __Pyx_PyBytes_FromCString(s)   __Pyx_PyBytes_FromString((const char*)s)
+#define __Pyx_PyByteArray_FromCString(s)   __Pyx_PyByteArray_FromString((const char*)s)
+#define __Pyx_PyStr_FromCString(s)     __Pyx_PyStr_FromString((const char*)s)
+#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s)
+static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) { /* Number of Py_UNICODE units before the terminating zero unit. */
+    size_t length = 0; /* index of the unit currently being inspected */
+    while (u[length] != 0) ++length; /* stop at the first zero unit; it is not counted */
+    return length;
+}
+#define __Pyx_PyUnicode_FromUnicode(u)       PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u))
+#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode
+#define __Pyx_PyUnicode_AsUnicode            PyUnicode_AsUnicode
+#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj)
+#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None)
+static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b);
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*);
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x);
+#define __Pyx_PySequence_Tuple(obj)\
+    (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj))
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*);
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t);
+#if CYTHON_ASSUME_SAFE_MACROS
+#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x))
+#else
+#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x)
+#endif
+#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x))
+#if PY_MAJOR_VERSION >= 3
+#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x))
+#else
+#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x))
+#endif
+#define __Pyx_PyNumber_Float(x) (PyFloat_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Float(x))
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+static int __Pyx_sys_getdefaultencoding_not_ascii;
+static int __Pyx_init_sys_getdefaultencoding_params(void) { /* Records in __Pyx_sys_getdefaultencoding_not_ascii whether sys.getdefaultencoding() is "ascii"; returns 0 on success, -1 on failure. */
+    PyObject* sys;
+    PyObject* default_encoding = NULL; /* owned references start as NULL so the bad: path can Py_XDECREF them unconditionally */
+    PyObject* ascii_chars_u = NULL;
+    PyObject* ascii_chars_b = NULL;
+    const char* default_encoding_c;
+    sys = PyImport_ImportModule("sys");
+    if (!sys) goto bad;
+    default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL);
+    Py_DECREF(sys); /* the module reference is not needed after the call, whether or not it succeeded */
+    if (!default_encoding) goto bad;
+    default_encoding_c = PyBytes_AsString(default_encoding); /* borrowed pointer into default_encoding; valid while that object is alive */
+    if (!default_encoding_c) goto bad;
+    if (strcmp(default_encoding_c, "ascii") == 0) {
+        __Pyx_sys_getdefaultencoding_not_ascii = 0;
+    } else {
+        char ascii_chars[128]; /* the 128 code points 0..127, used as a round-trip probe */
+        int c;
+        for (c = 0; c < 128; c++) {
+            ascii_chars[c] = c;
+        }
+        __Pyx_sys_getdefaultencoding_not_ascii = 1; /* set before the check below, so it stays 1 even if the check fails */
+        ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL);
+        if (!ascii_chars_u) goto bad;
+        ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL); /* re-encode with the default codec */
+        if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) { /* encoding failed or bytes differ: codec is not ASCII-compatible */
+            PyErr_Format(
+                PyExc_ValueError,
+                "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.",
+                default_encoding_c);
+            goto bad;
+        }
+        Py_DECREF(ascii_chars_u);
+        Py_DECREF(ascii_chars_b);
+    }
+    Py_DECREF(default_encoding);
+    return 0;
+bad: /* shared failure exit: release whatever was acquired so far */
+    Py_XDECREF(default_encoding);
+    Py_XDECREF(ascii_chars_u);
+    Py_XDECREF(ascii_chars_b);
+    return -1;
+}
+#endif
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL)
+#else
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL)
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+static char* __PYX_DEFAULT_STRING_ENCODING;
+static int __Pyx_init_sys_getdefaultencoding_params(void) { /* Copies sys.getdefaultencoding() into the heap string __PYX_DEFAULT_STRING_ENCODING; returns 0 on success, -1 with a Python exception set on failure. */
+    PyObject* sys;
+    PyObject* default_encoding = NULL; /* NULL so the bad: path can Py_XDECREF it unconditionally */
+    char* default_encoding_c;
+    sys = PyImport_ImportModule("sys");
+    if (!sys) goto bad;
+    default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
+    Py_DECREF(sys); /* the module reference is not needed after the call */
+    if (!default_encoding) goto bad;
+    default_encoding_c = PyBytes_AsString(default_encoding); /* borrowed pointer; must be copied before default_encoding is released */
+    if (!default_encoding_c) goto bad;
+    __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c) + 1); /* +1: strcpy below also writes the terminating NUL */
+    if (!__PYX_DEFAULT_STRING_ENCODING) { PyErr_NoMemory(); goto bad; } /* malloc sets no Python error by itself */
+    strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c);
+    Py_DECREF(default_encoding);
+    return 0;
+bad: /* shared failure exit */
+    Py_XDECREF(default_encoding);
+    return -1;
+}
+#endif
+#endif
+
+
+/* Test for GCC > 2.95 */
+#if defined(__GNUC__)     && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))
+  #define likely(x)   __builtin_expect(!!(x), 1)
+  #define unlikely(x) __builtin_expect(!!(x), 0)
+#else /* !__GNUC__ or GCC < 2.95 */
+  #define likely(x)   (x)
+  #define unlikely(x) (x)
+#endif /* __GNUC__ */
+static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void)ptr; } /* No-op that only discards its argument; presumably called to quiet "may be used uninitialized" compiler warnings -- confirm at call sites. */
+
+static PyObject *__pyx_m = NULL;
+static PyObject *__pyx_d;
+static PyObject *__pyx_b;
+static PyObject *__pyx_cython_runtime = NULL;
+static PyObject *__pyx_empty_tuple;
+static PyObject *__pyx_empty_bytes;
+static PyObject *__pyx_empty_unicode;
+static int __pyx_lineno;
+static int __pyx_clineno = 0;
+static const char * __pyx_cfilenm= __FILE__;
+static const char *__pyx_filename;
+
+
+static const char *__pyx_f[] = {
+  "python/bfs/bfs_wrapper.pyx",
+};
+
+/*--- Type declarations ---*/
+struct __pyx_opt_args_11bfs_wrapper_bfs;
+
+/* "bfs_wrapper.pyx":152
+ *         gdf_add_transpose(<gdf_graph*>graph)
+ * 
+ * cpdef bfs(G, start, directed=True):             # <<<<<<<<<<<<<<
+ *     """
+ *     Find the distances and predecessors for a breadth first traversal of a graph.
+ */
+struct __pyx_opt_args_11bfs_wrapper_bfs { /* Optional-argument bundle passed to __pyx_f_11bfs_wrapper_bfs (cpdef bfs, bfs_wrapper.pyx line 152). */
+  int __pyx_n; /* presumably the number of optional arguments the caller actually supplied -- confirm in the function body */
+  PyObject *directed; /* value for the `directed` keyword, which the .pyx signature defaults to True */
+};
+
+/* --- Runtime support code (head) --- */
+/* Refnanny.proto */
+#ifndef CYTHON_REFNANNY
+  #define CYTHON_REFNANNY 0
+#endif
+#if CYTHON_REFNANNY
+  typedef struct { /* Function-pointer table reached through __Pyx_RefNanny; the __Pyx_INCREF/DECREF/GOTREF/GIVEREF macros dispatch here when CYTHON_REFNANNY is enabled. */
+    void (*INCREF)(void*, PyObject*, int); /* macros pass (context, object, __LINE__) */
+    void (*DECREF)(void*, PyObject*, int);
+    void (*GOTREF)(void*, PyObject*, int);
+    void (*GIVEREF)(void*, PyObject*, int);
+    void* (*SetupContext)(const char*, int, const char*); /* called with (name, __LINE__, __FILE__); result is stored in __pyx_refnanny */
+    void (*FinishContext)(void**); /* called with &__pyx_refnanny */
+  } __Pyx_RefNannyAPIStruct;
+  static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL;
+  static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname);
+  #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL;
+#ifdef WITH_THREAD
+  #define __Pyx_RefNannySetupContext(name, acquire_gil)\
+          if (acquire_gil) {\
+              PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\
+              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\
+              PyGILState_Release(__pyx_gilstate_save);\
+          } else {\
+              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\
+          }
+#else
+  #define __Pyx_RefNannySetupContext(name, acquire_gil)\
+          __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__)
+#endif
+  #define __Pyx_RefNannyFinishContext()\
+          __Pyx_RefNanny->FinishContext(&__pyx_refnanny)
+  #define __Pyx_INCREF(r)  __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_DECREF(r)  __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_GOTREF(r)  __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_XINCREF(r)  do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0)
+  #define __Pyx_XDECREF(r)  do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0)
+  #define __Pyx_XGOTREF(r)  do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0)
+  #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0)
+#else
+  #define __Pyx_RefNannyDeclarations
+  #define __Pyx_RefNannySetupContext(name, acquire_gil)
+  #define __Pyx_RefNannyFinishContext()
+  #define __Pyx_INCREF(r) Py_INCREF(r)
+  #define __Pyx_DECREF(r) Py_DECREF(r)
+  #define __Pyx_GOTREF(r)
+  #define __Pyx_GIVEREF(r)
+  #define __Pyx_XINCREF(r) Py_XINCREF(r)
+  #define __Pyx_XDECREF(r) Py_XDECREF(r)
+  #define __Pyx_XGOTREF(r)
+  #define __Pyx_XGIVEREF(r)
+#endif
+#define __Pyx_XDECREF_SET(r, v) do {\
+        PyObject *tmp = (PyObject *) r;\
+        r = v; __Pyx_XDECREF(tmp);\
+    } while (0)
+#define __Pyx_DECREF_SET(r, v) do {\
+        PyObject *tmp = (PyObject *) r;\
+        r = v; __Pyx_DECREF(tmp);\
+    } while (0)
+#define __Pyx_CLEAR(r)    do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0)
+#define __Pyx_XCLEAR(r)   do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0)
+
+/* PyObjectGetAttrStr.proto */
+#if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name);
+#else
+#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n)
+#endif
+
+/* GetBuiltinName.proto */
+static PyObject *__Pyx_GetBuiltinName(PyObject *name);
+
+/* GetModuleGlobalName.proto */
+static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name);
+
+/* PyCFunctionFastCall.proto */
+#if CYTHON_FAST_PYCCALL
+static CYTHON_INLINE PyObject *__Pyx_PyCFunction_FastCall(PyObject *func, PyObject **args, Py_ssize_t nargs);
+#else
+#define __Pyx_PyCFunction_FastCall(func, args, nargs)  (assert(0), NULL)
+#endif
+
+/* PyFunctionFastCall.proto */
+#if CYTHON_FAST_PYCALL
+#define __Pyx_PyFunction_FastCall(func, args, nargs)\
+    __Pyx_PyFunction_FastCallDict((func), (args), (nargs), NULL)
+#if 1 || PY_VERSION_HEX < 0x030600B1
+static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, int nargs, PyObject *kwargs);
+#else
+#define __Pyx_PyFunction_FastCallDict(func, args, nargs, kwargs) _PyFunction_FastCallDict(func, args, nargs, kwargs)
+#endif
+#endif
+
+/* PyObjectCall.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw);
+#else
+#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw)
+#endif
+
+/* PyObjectCallMethO.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg);
+#endif
+
+/* PyObjectCallOneArg.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg);
+
+/* PyObjectCallNoArg.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func);
+#else
+#define __Pyx_PyObject_CallNoArg(func) __Pyx_PyObject_Call(func, __pyx_empty_tuple, NULL)
+#endif
+
+/* GetItemInt.proto */
+#define __Pyx_GetItemInt(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+    __Pyx_GetItemInt_Fast(o, (Py_ssize_t)i, is_list, wraparound, boundscheck) :\
+    (is_list ? (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL) :\
+               __Pyx_GetItemInt_Generic(o, to_py_func(i))))
+#define __Pyx_GetItemInt_List(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+    __Pyx_GetItemInt_List_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\
+    (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL))
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i,
+                                                              int wraparound, int boundscheck);
+#define __Pyx_GetItemInt_Tuple(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
+    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
+    __Pyx_GetItemInt_Tuple_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\
+    (PyErr_SetString(PyExc_IndexError, "tuple index out of range"), (PyObject*)NULL))
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i,
+                                                              int wraparound, int boundscheck);
+static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j);
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i,
+                                                     int is_list, int wraparound, int boundscheck);
+
+/* ObjectGetItem.proto */
+#if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject* key);
+#else
+#define __Pyx_PyObject_GetItem(obj, key)  PyObject_GetItem(obj, key)
+#endif
+
+/* PyObjectSetAttrStr.proto */
+#if CYTHON_USE_TYPE_SLOTS
+#define __Pyx_PyObject_DelAttrStr(o,n) __Pyx_PyObject_SetAttrStr(o, n, NULL)
+static CYTHON_INLINE int __Pyx_PyObject_SetAttrStr(PyObject* obj, PyObject* attr_name, PyObject* value);
+#else
+#define __Pyx_PyObject_DelAttrStr(o,n)   PyObject_DelAttr(o,n)
+#define __Pyx_PyObject_SetAttrStr(o,n,v) PyObject_SetAttr(o,n,v)
+#endif
+
+/* RaiseArgTupleInvalid.proto */
+static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact,
+    Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found);
+
+/* RaiseDoubleKeywords.proto */
+static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name);
+
+/* ParseKeywords.proto */
+static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[],\
+    PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,\
+    const char* function_name);
+
+/* Import.proto */
+static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level);
+
+/* ImportFrom.proto */
+static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name);
+
+/* FetchCommonType.proto */
+static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type);
+
+/* CythonFunction.proto */
+#define __Pyx_CyFunction_USED 1
+#define __Pyx_CYFUNCTION_STATICMETHOD  0x01
+#define __Pyx_CYFUNCTION_CLASSMETHOD   0x02
+#define __Pyx_CYFUNCTION_CCLASS        0x04
+#define __Pyx_CyFunction_GetClosure(f)\
+    (((__pyx_CyFunctionObject *) (f))->func_closure)
+#define __Pyx_CyFunction_GetClassObj(f)\
+    (((__pyx_CyFunctionObject *) (f))->func_classobj)
+#define __Pyx_CyFunction_Defaults(type, f)\
+    ((type *)(((__pyx_CyFunctionObject *) (f))->defaults))
+#define __Pyx_CyFunction_SetDefaultsGetter(f, g)\
+    ((__pyx_CyFunctionObject *) (f))->defaults_getter = (g)
+typedef struct { /* Object layout behind __pyx_CyFunctionType; instances are created by __Pyx_CyFunction_New(). */
+    PyCFunctionObject func; /* first member, so the struct begins with a PyCFunctionObject */
+#if PY_VERSION_HEX < 0x030500A0
+    PyObject *func_weakreflist; /* only present when building against Python older than 3.5.0a0 */
+#endif
+    PyObject *func_dict;
+    PyObject *func_name;
+    PyObject *func_qualname;
+    PyObject *func_doc;
+    PyObject *func_globals;
+    PyObject *func_code;
+    PyObject *func_closure; /* read through __Pyx_CyFunction_GetClosure() */
+    PyObject *func_classobj; /* read through __Pyx_CyFunction_GetClassObj() */
+    void *defaults; /* untyped storage; __Pyx_CyFunction_Defaults(type, f) casts it to type* */
+    int defaults_pyobjects; /* presumably the count of PyObject* entries inside `defaults` -- TODO confirm in __Pyx_CyFunction_InitDefaults */
+    int flags; /* presumably a mask of the __Pyx_CYFUNCTION_* bits defined above -- confirm */
+    PyObject *defaults_tuple;
+    PyObject *defaults_kwdict;
+    PyObject *(*defaults_getter)(PyObject *); /* installed via __Pyx_CyFunction_SetDefaultsGetter() */
+    PyObject *func_annotations;
+} __pyx_CyFunctionObject;
+static PyTypeObject *__pyx_CyFunctionType = 0;
+#define __Pyx_CyFunction_NewEx(ml, flags, qualname, self, module, globals, code)\
+    __Pyx_CyFunction_New(__pyx_CyFunctionType, ml, flags, qualname, self, module, globals, code)
+static PyObject *__Pyx_CyFunction_New(PyTypeObject *, PyMethodDef *ml,
+                                      int flags, PyObject* qualname,
+                                      PyObject *self,
+                                      PyObject *module, PyObject *globals,
+                                      PyObject* code);
+static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *m,
+                                                         size_t size,
+                                                         int pyobjects);
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *m,
+                                                            PyObject *tuple);
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsKwDict(PyObject *m,
+                                                             PyObject *dict);
+static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *m,
+                                                              PyObject *dict);
+static int __pyx_CyFunction_init(void);
+
+/* SetNameInClass.proto */
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1
+#define __Pyx_SetNameInClass(ns, name, value)\
+    (likely(PyDict_CheckExact(ns)) ? _PyDict_SetItem_KnownHash(ns, name, value, ((PyASCIIObject *) name)->hash) : PyObject_SetItem(ns, name, value))
+#elif CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_SetNameInClass(ns, name, value)\
+    (likely(PyDict_CheckExact(ns)) ? PyDict_SetItem(ns, name, value) : PyObject_SetItem(ns, name, value))
+#else
+#define __Pyx_SetNameInClass(ns, name, value)  PyObject_SetItem(ns, name, value)
+#endif
+
+/* CalculateMetaclass.proto */
+static PyObject *__Pyx_CalculateMetaclass(PyTypeObject *metaclass, PyObject *bases);
+
+/* Py3ClassCreate.proto */
+static PyObject *__Pyx_Py3MetaclassPrepare(PyObject *metaclass, PyObject *bases, PyObject *name, PyObject *qualname,
+                                           PyObject *mkw, PyObject *modname, PyObject *doc);
+static PyObject *__Pyx_Py3ClassCreate(PyObject *metaclass, PyObject *name, PyObject *bases, PyObject *dict,
+                                      PyObject *mkw, int calculate_metaclass, int allow_py2_metaclass);
+
+/* PyThreadStateGet.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyThreadState_declare  PyThreadState *__pyx_tstate;
+#define __Pyx_PyThreadState_assign  __pyx_tstate = __Pyx_PyThreadState_Current;
+#define __Pyx_PyErr_Occurred()  __pyx_tstate->curexc_type
+#else
+#define __Pyx_PyThreadState_declare
+#define __Pyx_PyThreadState_assign
+#define __Pyx_PyErr_Occurred()  PyErr_Occurred()
+#endif
+
+/* PyErrFetchRestore.proto */
+#if CYTHON_FAST_THREAD_STATE
+#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL)
+#define __Pyx_ErrRestoreWithState(type, value, tb)  __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb)
+#define __Pyx_ErrFetchWithState(type, value, tb)    __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb)
+#define __Pyx_ErrRestore(type, value, tb)  __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb)
+#define __Pyx_ErrFetch(type, value, tb)    __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb)
+static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb);
+static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL))
+#else
+#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
+#endif
+#else
+#define __Pyx_PyErr_Clear() PyErr_Clear()
+#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
+#define __Pyx_ErrRestoreWithState(type, value, tb)  PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetchWithState(type, value, tb)  PyErr_Fetch(type, value, tb)
+#define __Pyx_ErrRestoreInState(tstate, type, value, tb)  PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetchInState(tstate, type, value, tb)  PyErr_Fetch(type, value, tb)
+#define __Pyx_ErrRestore(type, value, tb)  PyErr_Restore(type, value, tb)
+#define __Pyx_ErrFetch(type, value, tb)  PyErr_Fetch(type, value, tb)
+#endif
+
+/* CLineInTraceback.proto */
+#ifdef CYTHON_CLINE_IN_TRACEBACK
+#define __Pyx_CLineForTraceback(tstate, c_line)  (((CYTHON_CLINE_IN_TRACEBACK)) ? c_line : 0)
+#else
+static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line);
+#endif
+
+/* CodeObjectCache.proto */
+typedef struct { /* One cache row: a code object paired with the line number it is stored under. */
+    PyCodeObject* code_object;
+    int code_line; /* lookup key used by __pyx_find_code_object(int code_line) */
+} __Pyx_CodeObjectCacheEntry;
+struct __Pyx_CodeObjectCache { /* Container type of the file-level __pyx_code_cache, which is initialised to {0,0,NULL}. */
+    int count; /* presumably the number of entries in use -- confirm in __pyx_insert_code_object */
+    int max_count; /* presumably the allocated capacity of `entries` -- confirm */
+    __Pyx_CodeObjectCacheEntry* entries; /* NULL until first allocation; presumably sorted by code_line given the bisect helper -- confirm */
+};
+static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL};
+static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line);
+static PyCodeObject *__pyx_find_code_object(int code_line);
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object);
+
+/* AddTraceback.proto */
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename);
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_gdf_dtype(gdf_dtype value);
+
+/* Print.proto */
+static int __Pyx_Print(PyObject*, PyObject *, int);
+#if CYTHON_COMPILING_IN_PYPY || PY_MAJOR_VERSION >= 3
+static PyObject* __pyx_print = 0;
+static PyObject* __pyx_print_kwargs = 0;
+#endif
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE gdf_dtype __Pyx_PyInt_As_gdf_dtype(PyObject *);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *);
+
+/* PrintOne.proto */
+static int __Pyx_PrintOne(PyObject* stream, PyObject *o);
+
+/* CIntToPy.proto */
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value);
+
+/* CIntFromPy.proto */
+static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *);
+
+/* FastTypeChecks.proto */
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type)
+static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b);
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type);
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2);
+#else
+#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type)
+#define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type)
+#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2))
+#endif
+#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception)
+
+/* CheckBinaryVersion.proto */
+static int __Pyx_check_binary_version(void);
+
+/* InitStrings.proto */
+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t);
+
+
+/* Module declarations from 'libcpp' */
+
+/* Module declarations from 'c_bfs' */
+
+/* Module declarations from 'libc.stdint' */
+
+/* Module declarations from 'libc.string' */
+
+/* Module declarations from 'libc.stdlib' */
+
+/* Module declarations from 'bfs_wrapper' */
+static PyObject *__pyx_f_11bfs_wrapper_create_column(PyObject *); /*proto*/
+static PyObject *__pyx_f_11bfs_wrapper_bfs(PyObject *, PyObject *, int __pyx_skip_dispatch, struct __pyx_opt_args_11bfs_wrapper_bfs *__pyx_optional_args); /*proto*/
+#define __Pyx_MODULE_NAME "bfs_wrapper"
+extern int __pyx_module_is_main_bfs_wrapper;
+int __pyx_module_is_main_bfs_wrapper = 0;
+
+/* Implementation of 'bfs_wrapper'
+ *
+ * Table of raw C strings used by the module. Each __pyx_k_<name> array holds
+ * the text of one identifier, attribute name, qualified name or file path.
+ * Most entries share their suffix with a PyObject slot declared further down
+ * (for example __pyx_k_value and __pyx_n_s_value); how the two are tied
+ * together is not visible in this part of the file - presumably a string
+ * initialisation table elsewhere, TODO confirm.
+ *
+ * The C identifier is not always the literal text: a leading underscore in
+ * the text is dropped from the identifier (__pyx_k_column is "_column",
+ * __pyx_k_get_ctype_ptr is "_get_ctype_ptr"), and a numeric suffix is used
+ * where that would collide (__pyx_k_data is "_data", __pyx_k_data_2 is
+ * "data").
+ */
+static const char __pyx_k_G[] = "G";
+static const char __pyx_k_g[] = "g";
+static const char __pyx_k_np[] = "np";
+static const char __pyx_k_doc[] = "__doc__";
+static const char __pyx_k_end[] = "end";
+static const char __pyx_k_gdf[] = "_gdf";
+static const char __pyx_k_obj[] = "obj";
+static const char __pyx_k_rmm[] = "rmm";
+static const char __pyx_k_cudf[] = "cudf";
+static const char __pyx_k_data[] = "_data"; /* underscore-prefixed attribute; plain "data" is __pyx_k_data_2 */
+static const char __pyx_k_dest[] = "dest";
+static const char __pyx_k_file[] = "file";
+static const char __pyx_k_init[] = "__init__";
+static const char __pyx_k_main[] = "__main__";
+static const char __pyx_k_mask[] = "_mask";
+static const char __pyx_k_self[] = "self";
+static const char __pyx_k_size[] = "size";
+static const char __pyx_k_test[] = "__test__";
+static const char __pyx_k_type[] = "type";
+static const char __pyx_k_Graph[] = "Graph";
+static const char __pyx_k_dtype[] = "dtype";
+static const char __pyx_k_graph[] = "graph";
+static const char __pyx_k_int32[] = "int32";
+static const char __pyx_k_int64[] = "int64";
+static const char __pyx_k_numpy[] = "numpy";
+static const char __pyx_k_print[] = "print";
+static const char __pyx_k_start[] = "start";
+static const char __pyx_k_value[] = "value";
+static const char __pyx_k_zeros[] = "zeros";
+static const char __pyx_k_Series[] = "Series";
+static const char __pyx_k_column[] = "_column";
+static const char __pyx_k_data_2[] = "data"; /* plain "data"; the "_data" attribute name is __pyx_k_data */
+static const char __pyx_k_dtypes[] = "dtypes";
+static const char __pyx_k_import[] = "__import__";
+static const char __pyx_k_librmm[] = "librmm";
+static const char __pyx_k_module[] = "__module__";
+static const char __pyx_k_source[] = "source";
+static const char __pyx_k_float32[] = "float32";
+static const char __pyx_k_float64[] = "float64";
+static const char __pyx_k_indices[] = "indices";
+static const char __pyx_k_offsets[] = "offsets";
+static const char __pyx_k_prepare[] = "__prepare__";
+static const char __pyx_k_dest_col[] = "dest_col";
+static const char __pyx_k_directed[] = "directed";
+static const char __pyx_k_qualname[] = "__qualname__";
+static const char __pyx_k_cffi_view[] = "cffi_view";
+static const char __pyx_k_graph_ptr[] = "graph_ptr";
+static const char __pyx_k_metaclass[] = "__metaclass__";
+static const char __pyx_k_value_col[] = "value_col";
+static const char __pyx_k_null_count[] = "null_count";
+static const char __pyx_k_source_col[] = "source_col";
+static const char __pyx_k_bfs_wrapper[] = "bfs_wrapper";
+static const char __pyx_k_indices_col[] = "indices_col";
+static const char __pyx_k_librmm_cffi[] = "librmm_cffi";
+static const char __pyx_k_offsets_col[] = "offsets_col";
+static const char __pyx_k_Graph___init[] = "Graph.__init__";
+static const char __pyx_k_add_adj_list[] = "add_adj_list";
+static const char __pyx_k_bfs_line_152[] = "bfs (line 152)";
+static const char __pyx_k_to_gpu_array[] = "to_gpu_array";
+static const char __pyx_k_add_edge_list[] = "add_edge_list";
+static const char __pyx_k_add_transpose[] = "add_transpose";
+static const char __pyx_k_get_ctype_ptr[] = "_get_ctype_ptr";
+static const char __pyx_k_view_edge_list[] = "view_edge_list";
+static const char __pyx_k_Graph_add_adj_list[] = "Graph.add_adj_list";
+static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback";
+static const char __pyx_k_Graph_add_edge_list[] = "Graph.add_edge_list";
+static const char __pyx_k_Graph_add_transpose[] = "Graph.add_transpose";
+static const char __pyx_k_get_column_data_ptr[] = "_get_column_data_ptr";
+static const char __pyx_k_Graph_view_edge_list[] = "Graph.view_edge_list";
+static const char __pyx_k_get_column_valid_ptr[] = "_get_column_valid_ptr";
+static const char __pyx_k_device_ctypes_pointer[] = "device_ctypes_pointer";
+static const char __pyx_k_Graph___init___line_48[] = "Graph.__init__ (line 48)";
+static const char __pyx_k_cffi_view_to_column_mem[] = "cffi_view_to_column_mem";
+static const char __pyx_k_python_bfs_bfs_wrapper_pyx[] = "python/bfs/bfs_wrapper.pyx";
+static const char __pyx_k_Graph_add_edge_list_line_66[] = "Graph.add_edge_list (line 66)";
+static const char __pyx_k_cuGraph_graph_class_containing[] = "\n        cuGraph graph class containing basic graph creation and transformation operations.\n    "; /* Multi-line documentation text follows in this and the next three constants. The identifier of each is a truncated, underscore-joined prefix of its text. Which Python object each text is attached to is not visible here - judging by the wording, the Graph class, bfs, Graph.__init__ and Graph.add_edge_list respectively; TODO confirm. */
+static const char __pyx_k_Find_the_distances_and_predeces[] = "\n    Find the distances and predecessors for a breadth first traversal of a graph.\n    \n    Parameters\n    ----------\n    G : cugraph.graph\n        cuGraph graph descriptor, should contain the connectivity information as an\n        adjacency list.\n    start : Integer\n        The index of the graph vertex from which the traversal begins\n    directed : bool\n        Indicates whether the graph in question is a directed graph, or whether\n        each edge has a corresponding reverse edge. (Allows optimizations if the\n        graph is undirected)\n    \n    Returns\n    -------\n    distances, predecessors : cudf.Series\n        distances gives the path distance for each vertex from the starting vertex\n        predecessors gives for each vertex the vertex it was reached from in the traversal\n        \n    Examples\n    --------\n    >>> M = ReadMtxFile(graph_file)\n    >>> sources = cudf.Series(M.row)\n    >>> destinations = cudf.Series(M.col)\n    >>> G = cuGraph.Graph()\n    >>> G.add_edge_list(sources,destinations,none)\n    >>> dist, pred = cuGraph.bfs(G, 0, false)\n    ";
+static const char __pyx_k_Returns_Graph_cuGraph_Graph_Exa[] = "\n        Returns\n        -------\n        Graph : cuGraph.Graph.\n\n        Examples\n        --------\n        >>> import cuGraph\n        >>> G = cuGraph.Graph()\n        "; /* same text as the __pyx_doc_11bfs_wrapper_5Graph___init__ array defined with the Graph.__init__ wrapper later in this file */
+static const char __pyx_k_Warp_existing_gdf_columns_repre[] = "\n        Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory. \n        The cuGraph graph should not already contain the connectivity information as an edge list.\n        If successful, the cuGraph graph descriptor contains the newly added edge list (edge_data is optional).\n\n        Parameters\n        ----------\n        source_indices : gdf_column       \n            This gdf_column of size E (number of edges) contains the index of the source for each edge.\n            Indices must be in the range [0, V-1]. \n        destination_indices   : gdf_column\n            This gdf_column of size E (number of edges) contains the index of the destination for each edge. \n            Indices must be in the range [0, V-1].\n        edge_data (optional)  : gdf_column\n            This pointer can be ``none``. If not, this gdf_column of size E (number of edges) contains the weiht for each edge. \n            The type expected to be floating point.\n\n        Examples\n        --------\n        >>> import cuGraph\n        >>> import cudf\n        >>> from scipy.io import mmread\n        >>> M = ReadMtxFile(graph_file)\n        >>> sources = cudf.Series(M.row)\n        >>> destinations = cudf.Series(M.col)\n        >>> G = cuGraph.Graph()\n        >>> G.add_edge_list(sources,destinations,none)\n        \n        "; /* NOTE(review): "Warp" and "weiht" in this text look like typos for "Wrap" and "weight". This file appears to be generated from bfs_wrapper.pyx, so any wording fix presumably belongs there and in the identifier derived from it - TODO confirm. */
+static PyObject *__pyx_kp_u_Find_the_distances_and_predeces; /* File-scope PyObject slots, one per
+ * string used by the module. Each slot shares its suffix with a __pyx_k_*
+ * character array in the string table. They are declared here without an
+ * initialiser; where they are populated is not visible in this part of the
+ * file (presumably during module initialisation, TODO confirm).
+ * The prefixes differ (__pyx_n_s_, __pyx_kp_s_, __pyx_kp_u_); the meaning of
+ * each prefix is a convention of the code generator and is not established by
+ * anything in this chunk.
+ */
+static PyObject *__pyx_n_s_G;
+static PyObject *__pyx_n_s_Graph;
+static PyObject *__pyx_n_s_Graph___init;
+static PyObject *__pyx_kp_u_Graph___init___line_48;
+static PyObject *__pyx_n_s_Graph_add_adj_list;
+static PyObject *__pyx_n_s_Graph_add_edge_list;
+static PyObject *__pyx_kp_u_Graph_add_edge_list_line_66;
+static PyObject *__pyx_n_s_Graph_add_transpose;
+static PyObject *__pyx_n_s_Graph_view_edge_list;
+static PyObject *__pyx_kp_u_Returns_Graph_cuGraph_Graph_Exa;
+static PyObject *__pyx_n_s_Series;
+static PyObject *__pyx_kp_u_Warp_existing_gdf_columns_repre;
+static PyObject *__pyx_n_s_add_adj_list;
+static PyObject *__pyx_n_s_add_edge_list;
+static PyObject *__pyx_n_s_add_transpose;
+static PyObject *__pyx_kp_u_bfs_line_152;
+static PyObject *__pyx_n_s_bfs_wrapper;
+static PyObject *__pyx_n_s_cffi_view;
+static PyObject *__pyx_n_s_cffi_view_to_column_mem;
+static PyObject *__pyx_n_s_cline_in_traceback;
+static PyObject *__pyx_n_s_column;
+static PyObject *__pyx_kp_s_cuGraph_graph_class_containing;
+static PyObject *__pyx_n_s_cudf;
+static PyObject *__pyx_n_s_data;
+static PyObject *__pyx_n_s_data_2;
+static PyObject *__pyx_n_s_dest;
+static PyObject *__pyx_n_s_dest_col;
+static PyObject *__pyx_n_s_device_ctypes_pointer;
+static PyObject *__pyx_n_s_directed;
+static PyObject *__pyx_n_s_doc;
+static PyObject *__pyx_n_s_dtype;
+static PyObject *__pyx_n_s_dtypes;
+static PyObject *__pyx_n_s_end;
+static PyObject *__pyx_n_s_file;
+static PyObject *__pyx_n_s_float32;
+static PyObject *__pyx_n_s_float64;
+static PyObject *__pyx_n_s_g;
+static PyObject *__pyx_n_s_gdf;
+static PyObject *__pyx_n_s_get_column_data_ptr;
+static PyObject *__pyx_n_s_get_column_valid_ptr;
+static PyObject *__pyx_n_s_get_ctype_ptr;
+static PyObject *__pyx_n_s_graph;
+static PyObject *__pyx_n_s_graph_ptr;
+static PyObject *__pyx_n_s_import;
+static PyObject *__pyx_n_s_indices;
+static PyObject *__pyx_n_s_indices_col;
+static PyObject *__pyx_n_s_init;
+static PyObject *__pyx_n_s_int32;
+static PyObject *__pyx_n_s_int64;
+static PyObject *__pyx_n_s_librmm;
+static PyObject *__pyx_n_s_librmm_cffi;
+static PyObject *__pyx_n_s_main;
+static PyObject *__pyx_n_s_mask;
+static PyObject *__pyx_n_s_metaclass;
+static PyObject *__pyx_n_s_module;
+static PyObject *__pyx_n_s_np;
+static PyObject *__pyx_n_s_null_count;
+static PyObject *__pyx_n_s_numpy;
+static PyObject *__pyx_n_s_obj;
+static PyObject *__pyx_n_s_offsets;
+static PyObject *__pyx_n_s_offsets_col;
+static PyObject *__pyx_n_s_prepare;
+static PyObject *__pyx_n_s_print;
+static PyObject *__pyx_kp_s_python_bfs_bfs_wrapper_pyx;
+static PyObject *__pyx_n_s_qualname;
+static PyObject *__pyx_n_s_rmm;
+static PyObject *__pyx_n_s_self;
+static PyObject *__pyx_n_s_size;
+static PyObject *__pyx_n_s_source;
+static PyObject *__pyx_n_s_source_col;
+static PyObject *__pyx_n_s_start;
+static PyObject *__pyx_n_s_test;
+static PyObject *__pyx_n_s_to_gpu_array;
+static PyObject *__pyx_n_s_type;
+static PyObject *__pyx_n_s_value;
+static PyObject *__pyx_n_s_value_col;
+static PyObject *__pyx_n_s_view_edge_list;
+static PyObject *__pyx_n_s_zeros;
+static PyObject *__pyx_pf_11bfs_wrapper__get_ctype_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj); /* proto
+ * Forward declarations of the implementation functions (__pyx_pf_*), so that
+ * the Python-facing wrappers (__pyx_pw_*) can call them before their
+ * definitions appear. Every one returns a PyObject pointer and takes
+ * __pyx_self, marked CYTHON_UNUSED, followed by one PyObject pointer per
+ * Python-level parameter (named __pyx_v_<parameter>). The five entries whose
+ * name contains "5Graph" take __pyx_v_self as their first Python-level
+ * parameter.
+ */
+static PyObject *__pyx_pf_11bfs_wrapper_2_get_column_data_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj); /* proto */
+static PyObject *__pyx_pf_11bfs_wrapper_4_get_column_valid_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj); /* proto */
+static PyObject *__pyx_pf_11bfs_wrapper_5Graph___init__(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_11bfs_wrapper_5Graph_2add_edge_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self, PyObject *__pyx_v_source_col, PyObject *__pyx_v_dest_col, PyObject *__pyx_v_value_col); /* proto */
+static PyObject *__pyx_pf_11bfs_wrapper_5Graph_4view_edge_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_11bfs_wrapper_5Graph_6add_adj_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self, PyObject *__pyx_v_offsets_col, PyObject *__pyx_v_indices_col, PyObject *__pyx_v_value_col); /* proto */
+static PyObject *__pyx_pf_11bfs_wrapper_5Graph_8add_transpose(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self); /* proto */
+static PyObject *__pyx_pf_11bfs_wrapper_6bfs(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_G, PyObject *__pyx_v_start, PyObject *__pyx_v_directed); /* proto */
+static PyObject *__pyx_int_0; /* File-scope slots for further cached Python objects: one named for the
+ * integer 0, nine named as tuples and eight named as code objects. None of
+ * them is initialised or used inside this part of the file; what each tuple
+ * and code object contains has to be read from the code that fills them in
+ * (presumably a constant-caching routine run at module initialisation,
+ * TODO confirm).
+ */
+static PyObject *__pyx_tuple_;
+static PyObject *__pyx_tuple__3;
+static PyObject *__pyx_tuple__5;
+static PyObject *__pyx_tuple__7;
+static PyObject *__pyx_tuple__9;
+static PyObject *__pyx_tuple__11;
+static PyObject *__pyx_tuple__12;
+static PyObject *__pyx_tuple__14;
+static PyObject *__pyx_tuple__16;
+static PyObject *__pyx_codeobj__2;
+static PyObject *__pyx_codeobj__4;
+static PyObject *__pyx_codeobj__6;
+static PyObject *__pyx_codeobj__8;
+static PyObject *__pyx_codeobj__10;
+static PyObject *__pyx_codeobj__13;
+static PyObject *__pyx_codeobj__15;
+static PyObject *__pyx_codeobj__17;
+/* Late includes */
+
+/* "bfs_wrapper.pyx":12
+ * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}
+ * 
+ * def _get_ctype_ptr(obj):             # <<<<<<<<<<<<<<
+ *     # The manner to access the pointers in the gdf's might change, so
+ *     # encapsulating access in the following 3 methods. They might also be
+ */
+
+/* Python wrapper
+ *
+ * Entry point for the Python-level function "_get_ctype_ptr". The
+ * PyMethodDef below registers it under that name with the METH_O calling
+ * convention (exactly one positional argument) and no docstring (the last
+ * field is 0).
+ *
+ * The wrapper does no argument processing of its own: it forwards
+ * __pyx_self and the single argument to
+ * __pyx_pf_11bfs_wrapper__get_ctype_ptr and returns that function's result
+ * unchanged, including a NULL result that signals a raised exception.
+ */
+static PyObject *__pyx_pw_11bfs_wrapper_1_get_ctype_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj); /*proto*/
+static PyMethodDef __pyx_mdef_11bfs_wrapper_1_get_ctype_ptr = {"_get_ctype_ptr", (PyCFunction)__pyx_pw_11bfs_wrapper_1_get_ctype_ptr, METH_O, 0};
+static PyObject *__pyx_pw_11bfs_wrapper_1_get_ctype_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj) {
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("_get_ctype_ptr (wrapper)", 0);
+  __pyx_r = __pyx_pf_11bfs_wrapper__get_ctype_ptr(__pyx_self, ((PyObject *)__pyx_v_obj));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_11bfs_wrapper__get_ctype_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj) { /* Implementation of
+ * the Python function `_get_ctype_ptr(obj)`, i.e. the expression
+ * `obj.device_ctypes_pointer.value`.
+ *
+ * Parameters:
+ *   __pyx_self - unused.
+ *   __pyx_v_obj - any Python object; it only needs a `device_ctypes_pointer`
+ *                 attribute whose value in turn has a `value` attribute.
+ * Returns:
+ *   A new reference to `obj.device_ctypes_pointer.value`, or NULL with the
+ *   Python exception from the failing attribute lookup left set and a
+ *   traceback entry for "bfs_wrapper._get_ctype_ptr" added.
+ */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  __Pyx_RefNannySetupContext("_get_ctype_ptr", 0);
+
+  /* "bfs_wrapper.pyx":16
+ *     # encapsulating access in the following 3 methods. They might also be
+ *     # part of future gdf versions.
+ *     return obj.device_ctypes_pointer.value             # <<<<<<<<<<<<<<
+ * 
+ * def _get_column_data_ptr(obj):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_obj, __pyx_n_s_device_ctypes_pointer); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_value); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* intermediate object no longer needed once `.value` has been read */
+  __pyx_r = __pyx_t_2; /* the reference held in __pyx_t_2 is moved into the result ... */
+  __pyx_t_2 = 0; /* ... so the temporary is cleared rather than released */
+  goto __pyx_L0;
+
+  /* "bfs_wrapper.pyx":12
+ * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}
+ * 
+ * def _get_ctype_ptr(obj):             # <<<<<<<<<<<<<<
+ *     # The manner to access the pointers in the gdf's might change, so
+ *     # encapsulating access in the following 3 methods. They might also be
+ */
+
+  /* function exit code
+   * __pyx_L1_error is reached only through __PYX_ERR: any temporaries still
+   * held are released, a traceback entry is recorded and NULL is returned.
+   * __pyx_L0 is the common exit shared with the success path. */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_AddTraceback("bfs_wrapper._get_ctype_ptr", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "bfs_wrapper.pyx":18
+ *     return obj.device_ctypes_pointer.value
+ * 
+ * def _get_column_data_ptr(obj):             # <<<<<<<<<<<<<<
+ *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
+ * 
+ */
+
+/* Python wrapper
+ *
+ * Entry point for the Python-level function "_get_column_data_ptr". The
+ * PyMethodDef below registers it under that name with the METH_O calling
+ * convention (exactly one positional argument) and no docstring (the last
+ * field is 0).
+ *
+ * The wrapper forwards __pyx_self and the single argument to
+ * __pyx_pf_11bfs_wrapper_2_get_column_data_ptr and returns that function's
+ * result unchanged, including a NULL result that signals a raised exception.
+ */
+static PyObject *__pyx_pw_11bfs_wrapper_3_get_column_data_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj); /*proto*/
+static PyMethodDef __pyx_mdef_11bfs_wrapper_3_get_column_data_ptr = {"_get_column_data_ptr", (PyCFunction)__pyx_pw_11bfs_wrapper_3_get_column_data_ptr, METH_O, 0};
+static PyObject *__pyx_pw_11bfs_wrapper_3_get_column_data_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj) {
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("_get_column_data_ptr (wrapper)", 0);
+  __pyx_r = __pyx_pf_11bfs_wrapper_2_get_column_data_ptr(__pyx_self, ((PyObject *)__pyx_v_obj));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_11bfs_wrapper_2_get_column_data_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj) { /* Implementation of
+ * the Python function `_get_column_data_ptr(obj)`, i.e. the expression
+ * `_get_ctype_ptr(obj._column._data.to_gpu_array())`.
+ *
+ * Parameters:
+ *   __pyx_self - unused.
+ *   __pyx_v_obj - Python object providing `_column._data.to_gpu_array()`.
+ * Returns:
+ *   A new reference to whatever the module-level `_get_ctype_ptr` returns
+ *   for that array, or NULL with a Python exception set and a traceback
+ *   entry for "bfs_wrapper._get_column_data_ptr" added.
+ *
+ * Temporaries: __pyx_t_2 holds the `_get_ctype_ptr` callable, __pyx_t_3 the
+ * result of `to_gpu_array()`, __pyx_t_1 the final result; __pyx_t_4 and
+ * __pyx_t_5 are reused for several short-lived values.
+ */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  __Pyx_RefNannySetupContext("_get_column_data_ptr", 0);
+
+  /* "bfs_wrapper.pyx":19
+ * 
+ * def _get_column_data_ptr(obj):
+ *     return _get_ctype_ptr(obj._column._data.to_gpu_array())             # <<<<<<<<<<<<<<
+ * 
+ * def _get_column_valid_ptr(obj):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_get_ctype_ptr); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_obj, __pyx_n_s_column); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 19, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_data); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 19, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_to_gpu_array); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 19, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  __pyx_t_5 = NULL; /* from here __pyx_t_5 means "self of the bound method", if there is one */
+  if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { /* bound method: split it into function + self */
+    __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_4);
+    if (likely(__pyx_t_5)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4);
+      __Pyx_INCREF(__pyx_t_5);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_4, function); /* __pyx_t_4 now refers to the underlying function */
+    }
+  }
+  if (__pyx_t_5) { /* unpacked: call function(self) */
+    __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 19, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  } else { /* not unpacked: call the attribute with no arguments */
+    __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 19, __pyx_L1_error)
+  }
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_t_4 = NULL; /* reused below as the "self" of _get_ctype_ptr, should that global be a bound method */
+  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) {
+    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_2);
+    if (likely(__pyx_t_4)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2);
+      __Pyx_INCREF(__pyx_t_4);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_2, function);
+    }
+  }
+  if (!__pyx_t_4) { /* no self to prepend: single-argument call */
+    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __Pyx_GOTREF(__pyx_t_1);
+  } else { /* call with (self, array), using the first call path that applies to the callable's type */
+    #if CYTHON_FAST_PYCALL
+    if (PyFunction_Check(__pyx_t_2)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_3};
+      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    } else
+    #endif
+    #if CYTHON_FAST_PYCCALL
+    if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_3};
+      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    } else
+    #endif
+    { /* generic path: build a 2-tuple; PyTuple_SET_ITEM takes over both references, hence the temporaries are cleared, not released */
+      __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 19, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_4); __pyx_t_4 = NULL;
+      __Pyx_GIVEREF(__pyx_t_3);
+      PyTuple_SET_ITEM(__pyx_t_5, 0+1, __pyx_t_3);
+      __pyx_t_3 = 0;
+      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    }
+  }
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_r = __pyx_t_1; /* the reference is moved into the result, so the temporary is cleared below */
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "bfs_wrapper.pyx":18
+ *     return obj.device_ctypes_pointer.value
+ * 
+ * def _get_column_data_ptr(obj):             # <<<<<<<<<<<<<<
+ *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
+ * 
+ */
+
+  /* function exit code
+   * __pyx_L1_error is reached only through __PYX_ERR: every temporary that
+   * is still non-NULL is released, a traceback entry is recorded and NULL is
+   * returned. __pyx_L0 is the common exit shared with the success path. */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_AddTraceback("bfs_wrapper._get_column_data_ptr", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "bfs_wrapper.pyx":21
+ *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
+ * 
+ * def _get_column_valid_ptr(obj):             # <<<<<<<<<<<<<<
+ *     return _get_ctype_ptr(obj._column._mask.to_gpu_array())
+ * 
+ */
+
+/* Python wrapper
+ *
+ * Entry point for the Python-level function "_get_column_valid_ptr". The
+ * PyMethodDef below registers it under that name with the METH_O calling
+ * convention (exactly one positional argument) and no docstring (the last
+ * field is 0).
+ *
+ * The wrapper forwards __pyx_self and the single argument to
+ * __pyx_pf_11bfs_wrapper_4_get_column_valid_ptr and returns that function's
+ * result unchanged, including a NULL result that signals a raised exception.
+ */
+static PyObject *__pyx_pw_11bfs_wrapper_5_get_column_valid_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj); /*proto*/
+static PyMethodDef __pyx_mdef_11bfs_wrapper_5_get_column_valid_ptr = {"_get_column_valid_ptr", (PyCFunction)__pyx_pw_11bfs_wrapper_5_get_column_valid_ptr, METH_O, 0};
+static PyObject *__pyx_pw_11bfs_wrapper_5_get_column_valid_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj) {
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("_get_column_valid_ptr (wrapper)", 0);
+  __pyx_r = __pyx_pf_11bfs_wrapper_4_get_column_valid_ptr(__pyx_self, ((PyObject *)__pyx_v_obj));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_11bfs_wrapper_4_get_column_valid_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj) { /* Implementation of
+ * the Python function `_get_column_valid_ptr(obj)`, i.e. the expression
+ * `_get_ctype_ptr(obj._column._mask.to_gpu_array())`.
+ *
+ * Parameters:
+ *   __pyx_self - unused.
+ *   __pyx_v_obj - Python object providing `_column._mask.to_gpu_array()`.
+ * Returns:
+ *   A new reference to whatever the module-level `_get_ctype_ptr` returns
+ *   for that array, or NULL with a Python exception set and a traceback
+ *   entry for "bfs_wrapper._get_column_valid_ptr" added.
+ *
+ * Temporaries: __pyx_t_2 holds the `_get_ctype_ptr` callable, __pyx_t_3 the
+ * result of `to_gpu_array()`, __pyx_t_1 the final result; __pyx_t_4 and
+ * __pyx_t_5 are reused for several short-lived values.
+ */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  __Pyx_RefNannySetupContext("_get_column_valid_ptr", 0);
+
+  /* "bfs_wrapper.pyx":22
+ * 
+ * def _get_column_valid_ptr(obj):
+ *     return _get_ctype_ptr(obj._column._mask.to_gpu_array())             # <<<<<<<<<<<<<<
+ * 
+ * #def _get_gdf_as_matrix_ptr(gdf):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_get_ctype_ptr); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 22, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_obj, __pyx_n_s_column); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 22, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_mask); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 22, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_to_gpu_array); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 22, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  __pyx_t_5 = NULL; /* from here __pyx_t_5 means "self of the bound method", if there is one */
+  if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { /* bound method: split it into function + self */
+    __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_4);
+    if (likely(__pyx_t_5)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4);
+      __Pyx_INCREF(__pyx_t_5);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_4, function); /* __pyx_t_4 now refers to the underlying function */
+    }
+  }
+  if (__pyx_t_5) { /* unpacked: call function(self) */
+    __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 22, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  } else { /* not unpacked: call the attribute with no arguments */
+    __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 22, __pyx_L1_error)
+  }
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_t_4 = NULL; /* reused below as the "self" of _get_ctype_ptr, should that global be a bound method */
+  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) {
+    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_2);
+    if (likely(__pyx_t_4)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2);
+      __Pyx_INCREF(__pyx_t_4);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_2, function);
+    }
+  }
+  if (!__pyx_t_4) { /* no self to prepend: single-argument call */
+    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 22, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __Pyx_GOTREF(__pyx_t_1);
+  } else { /* call with (self, array), using the first call path that applies to the callable's type */
+    #if CYTHON_FAST_PYCALL
+    if (PyFunction_Check(__pyx_t_2)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_3};
+      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 22, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    } else
+    #endif
+    #if CYTHON_FAST_PYCCALL
+    if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_3};
+      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 22, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    } else
+    #endif
+    { /* generic path: build a 2-tuple; PyTuple_SET_ITEM takes over both references, hence the temporaries are cleared, not released */
+      __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 22, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_5);
+      __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_4); __pyx_t_4 = NULL;
+      __Pyx_GIVEREF(__pyx_t_3);
+      PyTuple_SET_ITEM(__pyx_t_5, 0+1, __pyx_t_3);
+      __pyx_t_3 = 0;
+      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 22, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+    }
+  }
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_r = __pyx_t_1; /* the reference is moved into the result, so the temporary is cleared below */
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "bfs_wrapper.pyx":21
+ *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
+ * 
+ * def _get_column_valid_ptr(obj):             # <<<<<<<<<<<<<<
+ *     return _get_ctype_ptr(obj._column._mask.to_gpu_array())
+ * 
+ */
+
+  /* function exit code
+   * __pyx_L1_error is reached only through __PYX_ERR: every temporary that
+   * is still non-NULL is released, a traceback entry is recorded and NULL is
+   * returned. __pyx_L0 is the common exit shared with the success path. */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_AddTraceback("bfs_wrapper._get_column_valid_ptr", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "bfs_wrapper.pyx":27
+ * #    return self._get_ctype_ptr(gdf.as_gpu_matrix())
+ * 
+ * cdef create_column(col):             # <<<<<<<<<<<<<<
+ * 
+ *     x= <gdf_column*>malloc(sizeof(gdf_column))
+ */
+
+static PyObject *__pyx_f_11bfs_wrapper_create_column(PyObject *__pyx_v_col) {
+  /* Build a heap-allocated gdf_column describing `col` and return its address.
+   *
+   * Parameters:
+   *   __pyx_v_col - Python object that supports len(), exposes `dtype.type`
+   *                 and `null_count`, and is accepted by the module-level
+   *                 `_get_column_data_ptr` function. `dtype.type` must be a
+   *                 key of the module-level `dtypes` mapping.
+   * Returns:
+   *   A new reference to a Python int holding the address of the malloc'ed
+   *   gdf_column, or NULL with a Python exception set. On success the caller
+   *   owns the allocation; nothing in this function releases it.
+   * Errors:
+   *   MemoryError when the allocation fails; otherwise whatever the attribute
+   *   lookups, the `dtypes[...]` lookup or the integer conversions raise.
+   *
+   * Deviations from what bfs_wrapper.pyx literally says (lines 29-30):
+   *   - the allocation assigned to the never-used local `x` is not emitted,
+   *     because it leaked sizeof(gdf_column) bytes on every call;
+   *   - the result of the remaining malloc is checked before it is used;
+   *   - the struct is released on every error exit, where its address never
+   *     reaches the caller.
+   * NOTE(review): the same three changes should be made in bfs_wrapper.pyx,
+   * otherwise regenerating this file will bring the defects back.
+   */
+  gdf_column *__pyx_v_c_col = NULL;
+  uintptr_t __pyx_v_data_ptr;
+  uintptr_t __pyx_v_col_ptr;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  uintptr_t __pyx_t_5;
+  Py_ssize_t __pyx_t_6;
+  gdf_dtype __pyx_t_7;
+  gdf_size_type __pyx_t_8;
+  __Pyx_RefNannySetupContext("create_column", 0);
+
+  /* "bfs_wrapper.pyx":29
+ * cdef create_column(col):
+ * 
+ *     x= <gdf_column*>malloc(sizeof(gdf_column))             # <<<<<<<<<<<<<<
+ *     cdef gdf_column* c_col = <gdf_column*>malloc(sizeof(gdf_column))
+ *     cdef uintptr_t data_ptr = _get_column_data_ptr(col)
+ */
+  /* Intentionally no code for the line above: `x` is never read, so the
+   * allocation could only leak. */
+
+  /* "bfs_wrapper.pyx":30
+ * 
+ *     x= <gdf_column*>malloc(sizeof(gdf_column))
+ *     cdef gdf_column* c_col = <gdf_column*>malloc(sizeof(gdf_column))             # <<<<<<<<<<<<<<
+ *     cdef uintptr_t data_ptr = _get_column_data_ptr(col)
+ *     #cdef uintptr_t valid_ptr = _get_column_valid_ptr(col)
+ */
+  __pyx_v_c_col = ((gdf_column *)malloc((sizeof(gdf_column))));
+  if (unlikely(!__pyx_v_c_col)) {
+    /* Without this check gdf_column_view_augmented would be handed NULL. */
+    PyErr_NoMemory();
+    __PYX_ERR(0, 30, __pyx_L1_error)
+  }
+
+  /* "bfs_wrapper.pyx":31
+ *     x= <gdf_column*>malloc(sizeof(gdf_column))
+ *     cdef gdf_column* c_col = <gdf_column*>malloc(sizeof(gdf_column))
+ *     cdef uintptr_t data_ptr = _get_column_data_ptr(col)             # <<<<<<<<<<<<<<
+ *     #cdef uintptr_t valid_ptr = _get_column_valid_ptr(col)
+ * 
+ */
+  __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_get_column_data_ptr); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 31, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_3 = NULL;
+  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) {
+    __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2);
+    if (likely(__pyx_t_3)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2);
+      __Pyx_INCREF(__pyx_t_3);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_2, function);
+    }
+  }
+  if (!__pyx_t_3) {
+    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 31, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+  } else {
+    #if CYTHON_FAST_PYCALL
+    if (PyFunction_Check(__pyx_t_2)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_col};
+      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 31, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+    } else
+    #endif
+    #if CYTHON_FAST_PYCCALL
+    if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_col};
+      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 31, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+    } else
+    #endif
+    {
+      __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 31, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_4);
+      __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = NULL;
+      __Pyx_INCREF(__pyx_v_col);
+      __Pyx_GIVEREF(__pyx_v_col);
+      PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_col);
+      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 31, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    }
+  }
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_t_5 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_5 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 31, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_data_ptr = __pyx_t_5;
+
+  /* "bfs_wrapper.pyx":37
+ *                               <void*> data_ptr,
+ *                               <gdf_valid_type*> 0,
+ *                               <gdf_size_type>len(col),             # <<<<<<<<<<<<<<
+ *                               dtypes[col.dtype.type],
+ *                               <gdf_size_type>col.null_count)
+ */
+  __pyx_t_6 = PyObject_Length(__pyx_v_col); if (unlikely(__pyx_t_6 == ((Py_ssize_t)-1))) __PYX_ERR(0, 37, __pyx_L1_error)
+
+  /* "bfs_wrapper.pyx":38
+ *                               <gdf_valid_type*> 0,
+ *                               <gdf_size_type>len(col),
+ *                               dtypes[col.dtype.type],             # <<<<<<<<<<<<<<
+ *                               <gdf_size_type>col.null_count)
+ * 
+ */
+  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_dtypes); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 38, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_col, __pyx_n_s_dtype); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 38, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_type); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 38, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __pyx_t_2 = __Pyx_PyObject_GetItem(__pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 38, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_t_7 = ((gdf_dtype)__Pyx_PyInt_As_gdf_dtype(__pyx_t_2)); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 38, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "bfs_wrapper.pyx":39
+ *                               <gdf_size_type>len(col),
+ *                               dtypes[col.dtype.type],
+ *                               <gdf_size_type>col.null_count)             # <<<<<<<<<<<<<<
+ * 
+ *     cdef uintptr_t col_ptr = <uintptr_t>c_col
+ */
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_col, __pyx_n_s_null_count); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 39, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_8 = __Pyx_PyInt_As_size_t(__pyx_t_2); if (unlikely((__pyx_t_8 == ((gdf_size_type)-1)) && PyErr_Occurred())) __PYX_ERR(0, 39, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "bfs_wrapper.pyx":34
+ *     #cdef uintptr_t valid_ptr = _get_column_valid_ptr(col)
+ * 
+ *     gdf_column_view_augmented(<gdf_column*>c_col,             # <<<<<<<<<<<<<<
+ *                               <void*> data_ptr,
+ *                               <gdf_valid_type*> 0,
+ */
+  /* The validity-mask argument is always a null pointer, while the null count
+   * is taken from `col`.
+   * NOTE(review): the value returned by gdf_column_view_augmented is still
+   * discarded, as in the .pyx; if it is a status code it should be checked -
+   * confirm against the library's declaration. */
+  (void)(gdf_column_view_augmented(((gdf_column *)__pyx_v_c_col), ((void *)__pyx_v_data_ptr), ((gdf_valid_type *)0), ((gdf_size_type)__pyx_t_6), __pyx_t_7, ((gdf_size_type)__pyx_t_8)));
+
+  /* "bfs_wrapper.pyx":41
+ *                               <gdf_size_type>col.null_count)
+ * 
+ *     cdef uintptr_t col_ptr = <uintptr_t>c_col             # <<<<<<<<<<<<<<
+ *     return col_ptr
+ * 
+ */
+  __pyx_v_col_ptr = ((uintptr_t)__pyx_v_c_col);
+
+  /* "bfs_wrapper.pyx":42
+ * 
+ *     cdef uintptr_t col_ptr = <uintptr_t>c_col
+ *     return col_ptr             # <<<<<<<<<<<<<<
+ * 
+ * class Graph:
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_v_col_ptr); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 42, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_r = __pyx_t_2;
+  __pyx_t_2 = 0;
+  goto __pyx_L0;
+
+  /* "bfs_wrapper.pyx":27
+ * #    return self._get_ctype_ptr(gdf.as_gpu_matrix())
+ * 
+ * cdef create_column(col):             # <<<<<<<<<<<<<<
+ * 
+ *     x= <gdf_column*>malloc(sizeof(gdf_column))
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  /* The address was never returned on this path, so the struct would be
+   * unreachable; free(NULL) is a no-op when the allocation itself failed.
+   * NOTE(review): assumes gdf_column_view_augmented keeps no pointer to the
+   * struct - confirm against the library. */
+  free(__pyx_v_c_col);
+  __Pyx_AddTraceback("bfs_wrapper.create_column", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "bfs_wrapper.pyx":48
+ *         cuGraph graph class containing basic graph creation and transformation operations.
+ *     """
+ *     def __init__(self):             # <<<<<<<<<<<<<<
+ *         """
+ *         Returns
+ */
+
+/* Python wrapper for Graph.__init__: registered below as METH_O, so the single argument it receives is `self`; it only forwards to the __pyx_pf_ implementation. */
+static PyObject *__pyx_pw_11bfs_wrapper_5Graph_1__init__(PyObject *__pyx_self, PyObject *__pyx_v_self); /*proto*/
+static char __pyx_doc_11bfs_wrapper_5Graph___init__[] = "\n        Returns\n        -------\n        Graph : cuGraph.Graph.\n\n        Examples\n        --------\n        >>> import cuGraph\n        >>> G = cuGraph.Graph()\n        ";
+static PyMethodDef __pyx_mdef_11bfs_wrapper_5Graph_1__init__ = {"__init__", (PyCFunction)__pyx_pw_11bfs_wrapper_5Graph_1__init__, METH_O, __pyx_doc_11bfs_wrapper_5Graph___init__};
+static PyObject *__pyx_pw_11bfs_wrapper_5Graph_1__init__(PyObject *__pyx_self, PyObject *__pyx_v_self) {
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__init__ (wrapper)", 0);
+  __pyx_r = __pyx_pf_11bfs_wrapper_5Graph___init__(__pyx_self, ((PyObject *)__pyx_v_self)); /* result (or NULL on error) is passed through unchanged */
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_11bfs_wrapper_5Graph___init__(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self) { /* Graph.__init__ body: allocates a zero-filled gdf_graph and stores its address as the Python integer attribute self.graph_ptr. Returns None, or NULL with an exception set. */
+  struct gdf_graph *__pyx_v_graph;
+  uintptr_t __pyx_v_graph_ptr;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  __Pyx_RefNannySetupContext("__init__", 0);
+
+  /* "bfs_wrapper.pyx":60
+ *         """
+ *         cdef gdf_graph* graph
+ *         graph = <gdf_graph*>calloc(1,sizeof(gdf_graph))             # <<<<<<<<<<<<<<
+ * 
+ *         cdef uintptr_t graph_ptr = <uintptr_t>graph
+ */
+  __pyx_v_graph = ((struct gdf_graph *)calloc(1, (sizeof(struct gdf_graph)))); if (unlikely(!__pyx_v_graph)) { PyErr_NoMemory(); __PYX_ERR(0, 60, __pyx_L1_error) } /* hand-added guard: a failed allocation raises MemoryError instead of silently storing graph_ptr == 0; mirror this check in bfs_wrapper.pyx so regeneration keeps it */
+
+  /* "bfs_wrapper.pyx":62
+ *         graph = <gdf_graph*>calloc(1,sizeof(gdf_graph))
+ * 
+ *         cdef uintptr_t graph_ptr = <uintptr_t>graph             # <<<<<<<<<<<<<<
+ *         self.graph_ptr = graph_ptr
+ * 
+ */
+  __pyx_v_graph_ptr = ((uintptr_t)__pyx_v_graph);
+
+  /* "bfs_wrapper.pyx":63
+ * 
+ *         cdef uintptr_t graph_ptr = <uintptr_t>graph
+ *         self.graph_ptr = graph_ptr             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_t_1 = __Pyx_PyInt_FromSize_t(__pyx_v_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 63, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (__Pyx_PyObject_SetAttrStr(__pyx_v_self, __pyx_n_s_graph_ptr, __pyx_t_1) < 0) __PYX_ERR(0, 63, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* NOTE(review): from here the struct is reachable only through the integer attribute; no matching free is visible in this chunk - confirm where it is released */
+
+  /* "bfs_wrapper.pyx":48
+ *         cuGraph graph class containing basic graph creation and transformation operations.
+ *     """
+ *     def __init__(self):             # <<<<<<<<<<<<<<
+ *         """
+ *         Returns
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1); free(__pyx_v_graph); /* every jump to this label happens after the calloc and before the pointer has been stored on self, so release it here (free(NULL) is a no-op) */
+  __Pyx_AddTraceback("bfs_wrapper.Graph.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "bfs_wrapper.pyx":66
+ * 
+ * 
+ *     def add_edge_list(self, source_col, dest_col, value_col=None):             # <<<<<<<<<<<<<<
+ *         """
+ *         Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory.
+ */
+
+/* Python wrapper for Graph.add_edge_list(self, source_col, dest_col, value_col=None): unpacks positional/keyword arguments (3 or 4 accepted) and forwards them to the __pyx_pf_ implementation. */
+static PyObject *__pyx_pw_11bfs_wrapper_5Graph_3add_edge_list(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static char __pyx_doc_11bfs_wrapper_5Graph_2add_edge_list[] = "\n        Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory. \n        The cuGraph graph should not already contain the connectivity information as an edge list.\n        If successful, the cuGraph graph descriptor contains the newly added edge list (edge_data is optional).\n\n        Parameters\n        ----------\n        source_indices : gdf_column       \n            This gdf_column of size E (number of edges) contains the index of the source for each edge.\n            Indices must be in the range [0, V-1]. \n        destination_indices   : gdf_column\n            This gdf_column of size E (number of edges) contains the index of the destination for each edge. \n            Indices must be in the range [0, V-1].\n        edge_data (optional)  : gdf_column\n            This pointer can be ``none``. If not, this gdf_column of size E (number of edges) contains the weiht for each edge. \n            The type expected to be floating point.\n\n        Examples\n        --------\n        >>> import cuGraph\n        >>> import cudf\n        >>> from scipy.io import mmread\n        >>> M = ReadMtxFile(graph_file)\n        >>> sources = cudf.Series(M.row)\n        >>> destinations = cudf.Series(M.col)\n        >>> G = cuGraph.Graph()\n        >>> G.add_edge_list(sources,destinations,none)\n        \n        "; /* NOTE(review): docstring has typos ("Warp" for "Wrap", "weiht" for "weight", lowercase "none") and its parameter names differ from the real ones (source_col, dest_col, value_col); fix in bfs_wrapper.pyx and regenerate rather than editing this literal */
+static PyMethodDef __pyx_mdef_11bfs_wrapper_5Graph_3add_edge_list = {"add_edge_list", (PyCFunction)__pyx_pw_11bfs_wrapper_5Graph_3add_edge_list, METH_VARARGS|METH_KEYWORDS, __pyx_doc_11bfs_wrapper_5Graph_2add_edge_list};
+static PyObject *__pyx_pw_11bfs_wrapper_5Graph_3add_edge_list(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  PyObject *__pyx_v_self = 0;
+  PyObject *__pyx_v_source_col = 0;
+  PyObject *__pyx_v_dest_col = 0;
+  PyObject *__pyx_v_value_col = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("add_edge_list (wrapper)", 0);
+  {
+    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_self,&__pyx_n_s_source_col,&__pyx_n_s_dest_col,&__pyx_n_s_value_col,0};
+    PyObject* values[4] = {0,0,0,0};
+    values[3] = ((PyObject *)((PyObject *)Py_None)); /* value_col defaults to None when the caller omits it */
+    if (unlikely(__pyx_kwds)) { /* keyword call: take whatever was passed positionally, then fill the remaining slots from the kwargs dict */
+      Py_ssize_t kw_args;
+      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+      switch (pos_args) {
+        case  4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+        CYTHON_FALLTHROUGH;
+        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = PyDict_Size(__pyx_kwds);
+      switch (pos_args) { /* entering at the first slot not supplied positionally; self, source_col and dest_col are mandatory */
+        case  0:
+        if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_self)) != 0)) kw_args--;
+        else goto __pyx_L5_argtuple_error;
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_source_col)) != 0)) kw_args--;
+        else {
+          __Pyx_RaiseArgtupleInvalid("add_edge_list", 0, 3, 4, 1); __PYX_ERR(0, 66, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  2:
+        if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_dest_col)) != 0)) kw_args--;
+        else {
+          __Pyx_RaiseArgtupleInvalid("add_edge_list", 0, 3, 4, 2); __PYX_ERR(0, 66, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  3:
+        if (kw_args > 0) { /* value_col is optional: a missing key keeps the None default */
+          PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_value_col);
+          if (value) { values[3] = value; kw_args--; }
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords left over after the known names were consumed are handed to the generic keyword parser */
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "add_edge_list") < 0)) __PYX_ERR(0, 66, __pyx_L3_error)
+      }
+    } else { /* positional-only call: exactly 3 or 4 arguments are accepted */
+      switch (PyTuple_GET_SIZE(__pyx_args)) {
+        case  4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+        CYTHON_FALLTHROUGH;
+        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+        values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+        values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+        break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+    }
+    __pyx_v_self = values[0];
+    __pyx_v_source_col = values[1];
+    __pyx_v_dest_col = values[2];
+    __pyx_v_value_col = values[3];
+  }
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("add_edge_list", 0, 3, 4, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 66, __pyx_L3_error)
+  __pyx_L3_error:;
+  __Pyx_AddTraceback("bfs_wrapper.Graph.add_edge_list", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf_11bfs_wrapper_5Graph_2add_edge_list(__pyx_self, __pyx_v_self, __pyx_v_source_col, __pyx_v_dest_col, __pyx_v_value_col);
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_11bfs_wrapper_5Graph_2add_edge_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self, PyObject *__pyx_v_source_col, PyObject *__pyx_v_dest_col, PyObject *__pyx_v_value_col) { /* Graph.add_edge_list body: reads self.graph_ptr, builds column descriptors via create_column, and passes all of them to gdf_edge_list_view. Returns None, or NULL with an exception set. */
+  uintptr_t __pyx_v_graph;
+  uintptr_t __pyx_v_source;
+  uintptr_t __pyx_v_dest;
+  uintptr_t __pyx_v_value;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  uintptr_t __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  __Pyx_RefNannySetupContext("add_edge_list", 0);
+
+  /* "bfs_wrapper.pyx":97
+ *         """
+ * 
+ *         cdef uintptr_t graph = self.graph_ptr             # <<<<<<<<<<<<<<
+ *         cdef uintptr_t source=create_column(source_col)
+ *         cdef uintptr_t dest=create_column(dest_col)
+ */
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_self, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 97, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 97, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_graph = __pyx_t_2; /* the graph pointer round-trips through a Python integer attribute, so its value is whatever self.graph_ptr currently holds */
+
+  /* "bfs_wrapper.pyx":98
+ * 
+ *         cdef uintptr_t graph = self.graph_ptr
+ *         cdef uintptr_t source=create_column(source_col)             # <<<<<<<<<<<<<<
+ *         cdef uintptr_t dest=create_column(dest_col)
+ *         cdef uintptr_t value
+ */
+  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_source_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 98, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 98, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_source = __pyx_t_2; /* create_column hands back the descriptor address boxed as a Python int; it is unboxed here */
+
+  /* "bfs_wrapper.pyx":99
+ *         cdef uintptr_t graph = self.graph_ptr
+ *         cdef uintptr_t source=create_column(source_col)
+ *         cdef uintptr_t dest=create_column(dest_col)             # <<<<<<<<<<<<<<
+ *         cdef uintptr_t value
+ *         if value_col is None:
+ */
+  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_dest_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 99, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 99, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_dest = __pyx_t_2; /* NOTE(review): if this or a later step fails, the descriptor already created for source_col is not released on the error path - confirm and fix in bfs_wrapper.pyx */
+
+  /* "bfs_wrapper.pyx":101
+ *         cdef uintptr_t dest=create_column(dest_col)
+ *         cdef uintptr_t value
+ *         if value_col is None:             # <<<<<<<<<<<<<<
+ *             value = 0
+ *         else:
+ */
+  __pyx_t_3 = (__pyx_v_value_col == Py_None);
+  __pyx_t_4 = (__pyx_t_3 != 0);
+  if (__pyx_t_4) {
+
+    /* "bfs_wrapper.pyx":102
+ *         cdef uintptr_t value
+ *         if value_col is None:
+ *             value = 0             # <<<<<<<<<<<<<<
+ *         else:
+ *             value=create_column(value_col)
+ */
+    __pyx_v_value = 0; /* no edge values: a null column pointer is passed to gdf_edge_list_view */
+
+    /* "bfs_wrapper.pyx":101
+ *         cdef uintptr_t dest=create_column(dest_col)
+ *         cdef uintptr_t value
+ *         if value_col is None:             # <<<<<<<<<<<<<<
+ *             value = 0
+ *         else:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "bfs_wrapper.pyx":104
+ *             value = 0
+ *         else:
+ *             value=create_column(value_col)             # <<<<<<<<<<<<<<
+ * 
+ *         gdf_edge_list_view(<gdf_graph*>graph,
+ */
+  /*else*/ {
+    __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_value_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 104, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 104, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __pyx_v_value = __pyx_t_2;
+  }
+  __pyx_L3:;
+
+  /* "bfs_wrapper.pyx":106
+ *             value=create_column(value_col)
+ * 
+ *         gdf_edge_list_view(<gdf_graph*>graph,             # <<<<<<<<<<<<<<
+ *                        <gdf_column*>source,
+ *                        <gdf_column*>dest,
+ */
+  (void)(gdf_edge_list_view(((struct gdf_graph *)__pyx_v_graph), ((gdf_column *)__pyx_v_source), ((gdf_column *)__pyx_v_dest), ((gdf_column *)__pyx_v_value))); /* NOTE(review): the call's result is discarded by the (void) cast, so a failure inside gdf_edge_list_view is never surfaced to Python; presumably it returns a status code - confirm and raise on error in bfs_wrapper.pyx */
+
+  /* "bfs_wrapper.pyx":66
+ * 
+ * 
+ *     def add_edge_list(self, source_col, dest_col, value_col=None):             # <<<<<<<<<<<<<<
+ *         """
+ *         Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory.
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("bfs_wrapper.Graph.add_edge_list", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "bfs_wrapper.pyx":111
+ *                        <gdf_column*>value)
+ * 
+ *     def view_edge_list(self):             # <<<<<<<<<<<<<<
+ *         ##TO DO
+ *         """
+ */
+
+/* Python wrapper for Graph.view_edge_list: registered below as METH_O, so the single argument it receives is `self`; it only forwards to the __pyx_pf_ implementation. */
+static PyObject *__pyx_pw_11bfs_wrapper_5Graph_5view_edge_list(PyObject *__pyx_self, PyObject *__pyx_v_self); /*proto*/
+static char __pyx_doc_11bfs_wrapper_5Graph_4view_edge_list[] = "\n        Display the edge list.\n        ";
+static PyMethodDef __pyx_mdef_11bfs_wrapper_5Graph_5view_edge_list = {"view_edge_list", (PyCFunction)__pyx_pw_11bfs_wrapper_5Graph_5view_edge_list, METH_O, __pyx_doc_11bfs_wrapper_5Graph_4view_edge_list};
+static PyObject *__pyx_pw_11bfs_wrapper_5Graph_5view_edge_list(PyObject *__pyx_self, PyObject *__pyx_v_self) {
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("view_edge_list (wrapper)", 0);
+  __pyx_r = __pyx_pf_11bfs_wrapper_5Graph_4view_edge_list(__pyx_self, ((PyObject *)__pyx_v_self)); /* result (or NULL on error) is passed through unchanged */
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_11bfs_wrapper_5Graph_4view_edge_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self) { /* Graph.view_edge_list body (marked TO DO in the .pyx): prints the source-index column size, calls cudf._gdf.cffi_view_to_column_mem on that column, discards the result and returns the integer 0. Returns NULL with an exception set on failure. */
+  uintptr_t __pyx_v_graph;
+  struct gdf_graph *__pyx_v_g;
+  gdf_size_type __pyx_v_size;
+  PyObject *__pyx_v_cffi_view = 0;
+  CYTHON_UNUSED PyObject *__pyx_v_data = NULL;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  uintptr_t __pyx_t_2;
+  gdf_size_type __pyx_t_3;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  __Pyx_RefNannySetupContext("view_edge_list", 0);
+
+  /* "bfs_wrapper.pyx":116
+ *         Display the edge list.
+ *         """
+ *         cdef uintptr_t graph = self.graph_ptr             # <<<<<<<<<<<<<<
+ *         cdef gdf_graph* g = <gdf_graph*>graph
+ *         size = g.edgeList.src_indices.size
+ */
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_self, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 116, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 116, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_graph = __pyx_t_2;
+
+  /* "bfs_wrapper.pyx":117
+ *         """
+ *         cdef uintptr_t graph = self.graph_ptr
+ *         cdef gdf_graph* g = <gdf_graph*>graph             # <<<<<<<<<<<<<<
+ *         size = g.edgeList.src_indices.size
+ *         print(size)
+ */
+  __pyx_v_g = ((struct gdf_graph *)__pyx_v_graph);
+
+  /* "bfs_wrapper.pyx":118
+ *         cdef uintptr_t graph = self.graph_ptr
+ *         cdef gdf_graph* g = <gdf_graph*>graph
+ *         size = g.edgeList.src_indices.size             # <<<<<<<<<<<<<<
+ *         print(size)
+ *         cdef object cffi_view = <object>g.edgeList.src_indices
+ */
+  __pyx_t_3 = __pyx_v_g->edgeList->src_indices->size; /* NOTE(review): g, g->edgeList and src_indices are dereferenced without NULL checks; Graph.__init__ calloc-zeroes the struct, so calling this before an edge list is attached presumably dereferences NULL - confirm and guard in bfs_wrapper.pyx */
+  __pyx_v_size = __pyx_t_3;
+
+  /* "bfs_wrapper.pyx":119
+ *         cdef gdf_graph* g = <gdf_graph*>graph
+ *         size = g.edgeList.src_indices.size
+ *         print(size)             # <<<<<<<<<<<<<<
+ *         cdef object cffi_view = <object>g.edgeList.src_indices
+ *         data = cudf._gdf.cffi_view_to_column_mem(cffi_view)
+ */
+  __pyx_t_1 = __Pyx_PyInt_FromSize_t(__pyx_v_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 119, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (__Pyx_PrintOne(0, __pyx_t_1) < 0) __PYX_ERR(0, 119, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "bfs_wrapper.pyx":120
+ *         size = g.edgeList.src_indices.size
+ *         print(size)
+ *         cdef object cffi_view = <object>g.edgeList.src_indices             # <<<<<<<<<<<<<<
+ *         data = cudf._gdf.cffi_view_to_column_mem(cffi_view)
+ *         #return pygdf.Series(data)
+ */
+  __pyx_t_1 = ((PyObject *)__pyx_v_g->edgeList->src_indices); /* NOTE(review): src_indices is used as a plain C struct pointer above (->size), yet here it is reinterpreted as a PyObject* */
+  __Pyx_INCREF(__pyx_t_1); /* NOTE(review): this writes a reference count into the gdf_column's own bytes, and the matching XDECREF at function exit treats the struct as a Python object again; looks like memory corruption - the <object> cast in bfs_wrapper.pyx needs replacing with a real conversion */
+  __pyx_v_cffi_view = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "bfs_wrapper.pyx":121
+ *         print(size)
+ *         cdef object cffi_view = <object>g.edgeList.src_indices
+ *         data = cudf._gdf.cffi_view_to_column_mem(cffi_view)             # <<<<<<<<<<<<<<
+ *         #return pygdf.Series(data)
+ *         return 0
+ */
+  __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_cudf); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 121, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_gdf); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 121, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_cffi_view_to_column_mem); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 121, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  __pyx_t_5 = NULL;
+  if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) { /* bound method: split into its self object and underlying function so self can be passed as an explicit first argument */
+    __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_4);
+    if (likely(__pyx_t_5)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4);
+      __Pyx_INCREF(__pyx_t_5);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_4, function);
+    }
+  }
+  if (!__pyx_t_5) { /* plain callable: call it with cffi_view as the only argument */
+    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_v_cffi_view); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 121, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+  } else { /* unpacked bound method: call with (self, cffi_view), using a fast-call path when one is compiled in, else an argument tuple */
+    #if CYTHON_FAST_PYCALL
+    if (PyFunction_Check(__pyx_t_4)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_v_cffi_view};
+      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 121, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+    } else
+    #endif
+    #if CYTHON_FAST_PYCCALL
+    if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_v_cffi_view};
+      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 121, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+    } else
+    #endif
+    {
+      __pyx_t_6 = PyTuple_New(1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 121, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_5); __pyx_t_5 = NULL;
+      __Pyx_INCREF(__pyx_v_cffi_view);
+      __Pyx_GIVEREF(__pyx_v_cffi_view);
+      PyTuple_SET_ITEM(__pyx_t_6, 0+1, __pyx_v_cffi_view);
+      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 121, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+    }
+  }
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_v_data = __pyx_t_1; /* the call result is kept only so it can be released at exit; it is never returned */
+  __pyx_t_1 = 0;
+
+  /* "bfs_wrapper.pyx":123
+ *         data = cudf._gdf.cffi_view_to_column_mem(cffi_view)
+ *         #return pygdf.Series(data)
+ *         return 0             # <<<<<<<<<<<<<<
+ * 
+ *     def add_adj_list(self, offsets_col, indices_col, value_col):
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_int_0);
+  __pyx_r = __pyx_int_0;
+  goto __pyx_L0;
+
+  /* "bfs_wrapper.pyx":111
+ *                        <gdf_column*>value)
+ * 
+ *     def view_edge_list(self):             # <<<<<<<<<<<<<<
+ *         ##TO DO
+ *         """
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_AddTraceback("bfs_wrapper.Graph.view_edge_list", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XDECREF(__pyx_v_cffi_view);
+  __Pyx_XDECREF(__pyx_v_data);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "bfs_wrapper.pyx":125
+ *         return 0
+ * 
+ *     def add_adj_list(self, offsets_col, indices_col, value_col):             # <<<<<<<<<<<<<<
+ *         """
+ *         Warp existing gdf columns representing an adjacency list in a gdf_graph.
+ */
+
+/* Python wrapper for Graph.add_adj_list(self, offsets_col, indices_col, value_col): unpacks positional/keyword arguments (exactly 4 required, no defaults) and forwards them to the __pyx_pf_ implementation. */
+static PyObject *__pyx_pw_11bfs_wrapper_5Graph_7add_adj_list(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static char __pyx_doc_11bfs_wrapper_5Graph_6add_adj_list[] = "\n        Warp existing gdf columns representing an adjacency list in a gdf_graph.\n        "; /* NOTE(review): "Warp" looks like a typo for "Wrap"; fix in bfs_wrapper.pyx and regenerate rather than editing this literal */
+static PyMethodDef __pyx_mdef_11bfs_wrapper_5Graph_7add_adj_list = {"add_adj_list", (PyCFunction)__pyx_pw_11bfs_wrapper_5Graph_7add_adj_list, METH_VARARGS|METH_KEYWORDS, __pyx_doc_11bfs_wrapper_5Graph_6add_adj_list};
+static PyObject *__pyx_pw_11bfs_wrapper_5Graph_7add_adj_list(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  PyObject *__pyx_v_self = 0;
+  PyObject *__pyx_v_offsets_col = 0;
+  PyObject *__pyx_v_indices_col = 0;
+  PyObject *__pyx_v_value_col = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("add_adj_list (wrapper)", 0);
+  {
+    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_self,&__pyx_n_s_offsets_col,&__pyx_n_s_indices_col,&__pyx_n_s_value_col,0};
+    PyObject* values[4] = {0,0,0,0}; /* unlike add_edge_list, value_col gets no None default here, so callers must pass it explicitly */
+    if (unlikely(__pyx_kwds)) { /* keyword call: take whatever was passed positionally, then fill the remaining slots from the kwargs dict */
+      Py_ssize_t kw_args;
+      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+      switch (pos_args) {
+        case  4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+        CYTHON_FALLTHROUGH;
+        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = PyDict_Size(__pyx_kwds);
+      switch (pos_args) { /* entering at the first slot not supplied positionally; every remaining name is mandatory */
+        case  0:
+        if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_self)) != 0)) kw_args--;
+        else goto __pyx_L5_argtuple_error;
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_offsets_col)) != 0)) kw_args--;
+        else {
+          __Pyx_RaiseArgtupleInvalid("add_adj_list", 1, 4, 4, 1); __PYX_ERR(0, 125, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  2:
+        if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_indices_col)) != 0)) kw_args--;
+        else {
+          __Pyx_RaiseArgtupleInvalid("add_adj_list", 1, 4, 4, 2); __PYX_ERR(0, 125, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  3:
+        if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_value_col)) != 0)) kw_args--;
+        else {
+          __Pyx_RaiseArgtupleInvalid("add_adj_list", 1, 4, 4, 3); __PYX_ERR(0, 125, __pyx_L3_error)
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords left over after the known names were consumed are handed to the generic keyword parser */
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "add_adj_list") < 0)) __PYX_ERR(0, 125, __pyx_L3_error)
+      }
+    } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { /* positional-only call must supply all four arguments */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+      values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+      values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+      values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+    }
+    __pyx_v_self = values[0];
+    __pyx_v_offsets_col = values[1];
+    __pyx_v_indices_col = values[2];
+    __pyx_v_value_col = values[3];
+  }
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("add_adj_list", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 125, __pyx_L3_error)
+  __pyx_L3_error:;
+  __Pyx_AddTraceback("bfs_wrapper.Graph.add_adj_list", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf_11bfs_wrapper_5Graph_6add_adj_list(__pyx_self, __pyx_v_self, __pyx_v_offsets_col, __pyx_v_indices_col, __pyx_v_value_col);
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_11bfs_wrapper_5Graph_6add_adj_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self, PyObject *__pyx_v_offsets_col, PyObject *__pyx_v_indices_col, PyObject *__pyx_v_value_col) { /* Graph.add_adj_list body (marked TO TEST in the .pyx): reads self.graph_ptr, builds column descriptors via create_column, and passes all of them to gdf_adj_list_view. Returns None, or NULL with an exception set. */
+  uintptr_t __pyx_v_graph;
+  uintptr_t __pyx_v_offsets;
+  uintptr_t __pyx_v_indices;
+  uintptr_t __pyx_v_value;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  uintptr_t __pyx_t_2;
+  int __pyx_t_3;
+  int __pyx_t_4;
+  __Pyx_RefNannySetupContext("add_adj_list", 0);
+
+  /* "bfs_wrapper.pyx":130
+ *         """
+ *         ##TO TEST
+ *         cdef uintptr_t graph = self.graph_ptr             # <<<<<<<<<<<<<<
+ *         cdef uintptr_t offsets=create_column(offsets_col)
+ *         cdef uintptr_t indices=create_column(indices_col)
+ */
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_self, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 130, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 130, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_graph = __pyx_t_2; /* the graph pointer round-trips through a Python integer attribute, so its value is whatever self.graph_ptr currently holds */
+
+  /* "bfs_wrapper.pyx":131
+ *         ##TO TEST
+ *         cdef uintptr_t graph = self.graph_ptr
+ *         cdef uintptr_t offsets=create_column(offsets_col)             # <<<<<<<<<<<<<<
+ *         cdef uintptr_t indices=create_column(indices_col)
+ *         cdef uintptr_t value
+ */
+  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_offsets_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 131, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 131, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_offsets = __pyx_t_2; /* create_column hands back the descriptor address boxed as a Python int; it is unboxed here */
+
+  /* "bfs_wrapper.pyx":132
+ *         cdef uintptr_t graph = self.graph_ptr
+ *         cdef uintptr_t offsets=create_column(offsets_col)
+ *         cdef uintptr_t indices=create_column(indices_col)             # <<<<<<<<<<<<<<
+ *         cdef uintptr_t value
+ *         if value_col is None:
+ */
+  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_indices_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 132, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_indices = __pyx_t_2; /* NOTE(review): if this or a later step fails, the descriptor already created for offsets_col is not released on the error path - confirm and fix in bfs_wrapper.pyx */
+
+  /* "bfs_wrapper.pyx":134
+ *         cdef uintptr_t indices=create_column(indices_col)
+ *         cdef uintptr_t value
+ *         if value_col is None:             # <<<<<<<<<<<<<<
+ *             value = 0
+ *         else:
+ */
+  __pyx_t_3 = (__pyx_v_value_col == Py_None);
+  __pyx_t_4 = (__pyx_t_3 != 0);
+  if (__pyx_t_4) {
+
+    /* "bfs_wrapper.pyx":135
+ *         cdef uintptr_t value
+ *         if value_col is None:
+ *             value = 0             # <<<<<<<<<<<<<<
+ *         else:
+ *             value=create_column(value_col)
+ */
+    __pyx_v_value = 0; /* no edge values: a null column pointer is passed to gdf_adj_list_view */
+
+    /* "bfs_wrapper.pyx":134
+ *         cdef uintptr_t indices=create_column(indices_col)
+ *         cdef uintptr_t value
+ *         if value_col is None:             # <<<<<<<<<<<<<<
+ *             value = 0
+ *         else:
+ */
+    goto __pyx_L3;
+  }
+
+  /* "bfs_wrapper.pyx":137
+ *             value = 0
+ *         else:
+ *             value=create_column(value_col)             # <<<<<<<<<<<<<<
+ * 
+ *         gdf_adj_list_view(<gdf_graph*>graph,
+ */
+  /*else*/ {
+    __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_value_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 137, __pyx_L1_error)
+    __Pyx_GOTREF(__pyx_t_1);
+    __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 137, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+    __pyx_v_value = __pyx_t_2;
+  }
+  __pyx_L3:;
+
+  /* "bfs_wrapper.pyx":139
+ *             value=create_column(value_col)
+ * 
+ *         gdf_adj_list_view(<gdf_graph*>graph,             # <<<<<<<<<<<<<<
+ *                        <gdf_column*>offsets,
+ *                        <gdf_column*>indices,
+ */
+  (void)(gdf_adj_list_view(((struct gdf_graph *)__pyx_v_graph), ((gdf_column *)__pyx_v_offsets), ((gdf_column *)__pyx_v_indices), ((gdf_column *)__pyx_v_value))); /* NOTE(review): the call's result is discarded by the (void) cast, so a failure inside gdf_adj_list_view is never surfaced to Python; presumably it returns a status code - confirm and raise on error in bfs_wrapper.pyx */
+
+  /* "bfs_wrapper.pyx":125
+ *         return 0
+ * 
+ *     def add_adj_list(self, offsets_col, indices_col, value_col):             # <<<<<<<<<<<<<<
+ *         """
+ *         Warp existing gdf columns representing an adjacency list in a gdf_graph.
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("bfs_wrapper.Graph.add_adj_list", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "bfs_wrapper.pyx":145
+ * 
+ * 
+ *     def add_transpose(self):             # <<<<<<<<<<<<<<
+ *         """
+ *         Compute the transposed adjacency list from the edge list and add it to the existing graph.
+ */
+
+/* Python wrapper: METH_O entry point for Graph.add_transpose; its single argument is `self`, which is passed straight to the __pyx_pf_ implementation function. */
+static PyObject *__pyx_pw_11bfs_wrapper_5Graph_9add_transpose(PyObject *__pyx_self, PyObject *__pyx_v_self); /*proto*/
+static char __pyx_doc_11bfs_wrapper_5Graph_8add_transpose[] = "\n        Compute the transposed adjacency list from the edge list and add it to the existing graph.\n        ";
+static PyMethodDef __pyx_mdef_11bfs_wrapper_5Graph_9add_transpose = {"add_transpose", (PyCFunction)__pyx_pw_11bfs_wrapper_5Graph_9add_transpose, METH_O, __pyx_doc_11bfs_wrapper_5Graph_8add_transpose};
+static PyObject *__pyx_pw_11bfs_wrapper_5Graph_9add_transpose(PyObject *__pyx_self, PyObject *__pyx_v_self) {
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("add_transpose (wrapper)", 0);
+  __pyx_r = __pyx_pf_11bfs_wrapper_5Graph_8add_transpose(__pyx_self, ((PyObject *)__pyx_v_self)); /* result is returned to the caller unmodified */
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_11bfs_wrapper_5Graph_8add_transpose(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self) {
+  uintptr_t __pyx_v_graph;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  uintptr_t __pyx_t_2;
+  __Pyx_RefNannySetupContext("add_transpose", 0);
+  /* Implementation of Graph.add_transpose: reads self.graph_ptr as an integer, reinterprets it as a gdf_graph pointer and calls gdf_add_transpose on it. Returns None, or NULL (after adding a traceback entry) if the attribute lookup or integer conversion fails. */
+  /* "bfs_wrapper.pyx":149
+ *         Compute the transposed adjacency list from the edge list and add it to the existing graph.
+ *         """
+ *         cdef uintptr_t graph = self.graph_ptr             # <<<<<<<<<<<<<<
+ *         gdf_add_transpose(<gdf_graph*>graph)
+ * 
+ */
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_self, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 149, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 149, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_graph = __pyx_t_2;
+
+  /* "bfs_wrapper.pyx":150
+ *         """
+ *         cdef uintptr_t graph = self.graph_ptr
+ *         gdf_add_transpose(<gdf_graph*>graph)             # <<<<<<<<<<<<<<
+ * 
+ * cpdef bfs(G, start, directed=True):
+ */
+  (void)(gdf_add_transpose(((struct gdf_graph *)__pyx_v_graph))); /* NOTE(review): the return value is discarded; if it is an error status, failures are not reported to Python */
+
+  /* "bfs_wrapper.pyx":145
+ * 
+ * 
+ *     def add_transpose(self):             # <<<<<<<<<<<<<<
+ *         """
+ *         Compute the transposed adjacency list from the edge list and add it to the existing graph.
+ */
+
+  /* function exit code */
+  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
+  goto __pyx_L0;
+  __pyx_L1_error:; /* error path: release the pending temporary, add a traceback entry, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("bfs_wrapper.Graph.add_transpose", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "bfs_wrapper.pyx":152
+ *         gdf_add_transpose(<gdf_graph*>graph)
+ * 
+ * cpdef bfs(G, start, directed=True):             # <<<<<<<<<<<<<<
+ *     """
+ *     Find the distances and predecessors for a breadth first traversal of a graph.
+ */
+
+static PyObject *__pyx_pw_11bfs_wrapper_7bfs(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static PyObject *__pyx_f_11bfs_wrapper_bfs(PyObject *__pyx_v_G, PyObject *__pyx_v_start, CYTHON_UNUSED int __pyx_skip_dispatch, struct __pyx_opt_args_11bfs_wrapper_bfs *__pyx_optional_args) {
+  PyObject *__pyx_v_directed = ((PyObject *)Py_True);
+  uintptr_t __pyx_v_graph;
+  struct gdf_graph *__pyx_v_g;
+  gdf_size_type __pyx_v_num_verts;
+  PyObject *__pyx_v_distances = NULL;
+  uintptr_t __pyx_v_distances_ptr;
+  PyObject *__pyx_v_predecessors = NULL;
+  uintptr_t __pyx_v_predecessors_ptr;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  uintptr_t __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  PyObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  int __pyx_t_9;
+  bool __pyx_t_10;
+  __Pyx_RefNannySetupContext("bfs", 0);
+  if (__pyx_optional_args) {
+    if (__pyx_optional_args->__pyx_n > 0) {
+      __pyx_v_directed = __pyx_optional_args->directed;
+    }
+  }
+  /* C-level body of cpdef bfs(G, start, directed=True): reads G.graph_ptr, builds two zero-filled int32 cudf.Series with one entry per vertex (distances, predecessors), wraps each via create_column, calls gdf_bfs, and returns the tuple (distances, predecessors). Returns 0 after adding a traceback entry on any Python-level error. */
+  /* "bfs_wrapper.pyx":184
+ *     """
+ * 
+ *     cdef uintptr_t graph = G.graph_ptr             # <<<<<<<<<<<<<<
+ *     cdef gdf_graph* g = <gdf_graph*>graph
+ *     num_verts = g.adjList.offsets.size - 1
+ */
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_G, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 184, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 184, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_graph = __pyx_t_2;
+
+  /* "bfs_wrapper.pyx":185
+ * 
+ *     cdef uintptr_t graph = G.graph_ptr
+ *     cdef gdf_graph* g = <gdf_graph*>graph             # <<<<<<<<<<<<<<
+ *     num_verts = g.adjList.offsets.size - 1
+ *     distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+ */
+  __pyx_v_g = ((struct gdf_graph *)__pyx_v_graph);
+  if (unlikely(!__pyx_v_g || !__pyx_v_g->adjList || !__pyx_v_g->adjList->offsets)) { PyErr_SetString(PyExc_ValueError, "bfs requires a graph with an adjacency list"); __PYX_ERR(0, 186, __pyx_L1_error) } /* guard added by hand: the dereference chain below would otherwise crash on a NULL pointer */
+  /* "bfs_wrapper.pyx":186
+ *     cdef uintptr_t graph = G.graph_ptr
+ *     cdef gdf_graph* g = <gdf_graph*>graph
+ *     num_verts = g.adjList.offsets.size - 1             # <<<<<<<<<<<<<<
+ *     distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+ *     cdef uintptr_t distances_ptr = create_column(distances)
+ */
+  __pyx_v_num_verts = (__pyx_v_g->adjList->offsets->size - 1);
+
+  /* "bfs_wrapper.pyx":187
+ *     cdef gdf_graph* g = <gdf_graph*>graph
+ *     num_verts = g.adjList.offsets.size - 1
+ *     distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))             # <<<<<<<<<<<<<<
+ *     cdef uintptr_t distances_ptr = create_column(distances)
+ *     predecessors = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+ */
+  __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_cudf); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 187, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_Series); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 187, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 187, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_zeros); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 187, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __pyx_t_3 = __Pyx_PyInt_FromSize_t(__pyx_v_num_verts); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 187, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __pyx_t_6 = PyTuple_New(1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 187, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_6);
+  __Pyx_GIVEREF(__pyx_t_3);
+  PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_3);
+  __pyx_t_3 = 0;
+  __pyx_t_3 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 187, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __pyx_t_7 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 187, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_7, __pyx_n_s_int32); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 187, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_8);
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+  if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_dtype, __pyx_t_8) < 0) __PYX_ERR(0, 187, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+  __pyx_t_8 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_6, __pyx_t_3); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 187, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_8);
+  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __pyx_t_3 = NULL;
+  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_4))) {
+    __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_4);
+    if (likely(__pyx_t_3)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4);
+      __Pyx_INCREF(__pyx_t_3);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_4, function);
+    }
+  }
+  if (!__pyx_t_3) {
+    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_8); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 187, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+    __Pyx_GOTREF(__pyx_t_1);
+  } else {
+    #if CYTHON_FAST_PYCALL
+    if (PyFunction_Check(__pyx_t_4)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_t_8};
+      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 187, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+    } else
+    #endif
+    #if CYTHON_FAST_PYCCALL
+    if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_t_8};
+      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 187, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+    } else
+    #endif
+    {
+      __pyx_t_6 = PyTuple_New(1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 187, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_6);
+      __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_3); __pyx_t_3 = NULL;
+      __Pyx_GIVEREF(__pyx_t_8);
+      PyTuple_SET_ITEM(__pyx_t_6, 0+1, __pyx_t_8);
+      __pyx_t_8 = 0;
+      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 187, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+    }
+  }
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_v_distances = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "bfs_wrapper.pyx":188
+ *     num_verts = g.adjList.offsets.size - 1
+ *     distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+ *     cdef uintptr_t distances_ptr = create_column(distances)             # <<<<<<<<<<<<<<
+ *     predecessors = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+ *     cdef uintptr_t predecessors_ptr = create_column(distances)
+ */
+  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_distances); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 188, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 188, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_distances_ptr = __pyx_t_2;
+
+  /* "bfs_wrapper.pyx":189
+ *     distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+ *     cdef uintptr_t distances_ptr = create_column(distances)
+ *     predecessors = cudf.Series(np.zeros(num_verts, dtype=np.int32))             # <<<<<<<<<<<<<<
+ *     cdef uintptr_t predecessors_ptr = create_column(distances)
+ * 
+ */
+  __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_cudf); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 189, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_Series); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 189, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_6);
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 189, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_zeros); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 189, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_8);
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_t_4 = __Pyx_PyInt_FromSize_t(__pyx_v_num_verts); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 189, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_GIVEREF(__pyx_t_4);
+  PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4);
+  __pyx_t_4 = 0;
+  __pyx_t_4 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 189, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 189, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_5);
+  __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_int32); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 189, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  if (PyDict_SetItem(__pyx_t_4, __pyx_n_s_dtype, __pyx_t_7) < 0) __PYX_ERR(0, 189, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+  __pyx_t_7 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_t_3, __pyx_t_4); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 189, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  __pyx_t_4 = NULL;
+  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_6))) {
+    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_6);
+    if (likely(__pyx_t_4)) {
+      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_6);
+      __Pyx_INCREF(__pyx_t_4);
+      __Pyx_INCREF(function);
+      __Pyx_DECREF_SET(__pyx_t_6, function);
+    }
+  }
+  if (!__pyx_t_4) {
+    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_6, __pyx_t_7); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error)
+    __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+    __Pyx_GOTREF(__pyx_t_1);
+  } else {
+    #if CYTHON_FAST_PYCALL
+    if (PyFunction_Check(__pyx_t_6)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_7};
+      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_6, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+    } else
+    #endif
+    #if CYTHON_FAST_PYCCALL
+    if (__Pyx_PyFastCFunction_Check(__pyx_t_6)) {
+      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_7};
+      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_6, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error)
+      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+    } else
+    #endif
+    {
+      __pyx_t_3 = PyTuple_New(1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_3);
+      __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4); __pyx_t_4 = NULL;
+      __Pyx_GIVEREF(__pyx_t_7);
+      PyTuple_SET_ITEM(__pyx_t_3, 0+1, __pyx_t_7);
+      __pyx_t_7 = 0;
+      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_6, __pyx_t_3, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error)
+      __Pyx_GOTREF(__pyx_t_1);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    }
+  }
+  __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+  __pyx_v_predecessors = __pyx_t_1;
+  __pyx_t_1 = 0;
+
+  /* "bfs_wrapper.pyx":190
+ *     cdef uintptr_t distances_ptr = create_column(distances)
+ *     predecessors = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+ *     cdef uintptr_t predecessors_ptr = create_column(distances)             # <<<<<<<<<<<<<<
+ * NOTE: the C below was corrected by hand to wrap `predecessors`; the quoted .pyx line still passes `distances` and needs the same fix, or regeneration will revert this.
+ *     gdf_bfs(<gdf_graph*>g, <gdf_column*>distances_ptr, <gdf_column*>predecessors_ptr, <int>start, <bool>directed)
+ */
+  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_predecessors); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 190, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 190, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_v_predecessors_ptr = __pyx_t_2;
+
+  /* "bfs_wrapper.pyx":192
+ *     cdef uintptr_t predecessors_ptr = create_column(distances)
+ * 
+ *     gdf_bfs(<gdf_graph*>g, <gdf_column*>distances_ptr, <gdf_column*>predecessors_ptr, <int>start, <bool>directed)             # <<<<<<<<<<<<<<
+ *     return distances, predecessors
+ */
+  __pyx_t_9 = __Pyx_PyInt_As_int(__pyx_v_start); if (unlikely((__pyx_t_9 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 192, __pyx_L1_error)
+  __pyx_t_10 = __Pyx_PyObject_IsTrue(__pyx_v_directed); if (unlikely((__pyx_t_10 == ((bool)-1)) && PyErr_Occurred())) __PYX_ERR(0, 192, __pyx_L1_error)
+  (void)(gdf_bfs(((struct gdf_graph *)__pyx_v_g), ((gdf_column *)__pyx_v_distances_ptr), ((gdf_column *)__pyx_v_predecessors_ptr), ((int)__pyx_t_9), ((bool)__pyx_t_10)));
+  /* NOTE(review): the value returned by gdf_bfs is discarded by the (void) cast above; if it is an error status, a failed traversal is not reported to Python. */
+  /* "bfs_wrapper.pyx":193
+ * 
+ *     gdf_bfs(<gdf_graph*>g, <gdf_column*>distances_ptr, <gdf_column*>predecessors_ptr, <int>start, <bool>directed)
+ *     return distances, predecessors             # <<<<<<<<<<<<<<
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 193, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_INCREF(__pyx_v_distances);
+  __Pyx_GIVEREF(__pyx_v_distances);
+  PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_distances);
+  __Pyx_INCREF(__pyx_v_predecessors);
+  __Pyx_GIVEREF(__pyx_v_predecessors);
+  PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_v_predecessors);
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "bfs_wrapper.pyx":152
+ *         gdf_add_transpose(<gdf_graph*>graph)
+ * 
+ * cpdef bfs(G, start, directed=True):             # <<<<<<<<<<<<<<
+ *     """
+ *     Find the distances and predecessors for a breadth first traversal of a graph.
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release every temporary that may still hold a reference, add a traceback entry, return 0 */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  __Pyx_XDECREF(__pyx_t_6);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("bfs_wrapper.bfs", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0;
+  __pyx_L0:; /* common exit: drop the local references to both Series (the result tuple took its own references via INCREF) */
+  __Pyx_XDECREF(__pyx_v_distances);
+  __Pyx_XDECREF(__pyx_v_predecessors);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* Python wrapper: unpacks (G, start[, directed]) from positional and/or keyword arguments, with directed defaulting to Py_True, then calls the __pyx_pf_ implementation function. Returns NULL after adding a traceback entry if unpacking fails. */
+static PyObject *__pyx_pw_11bfs_wrapper_7bfs(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static char __pyx_doc_11bfs_wrapper_6bfs[] = "\n    Find the distances and predecessors for a breadth first traversal of a graph.\n    \n    Parameters\n    ----------\n    G : cugraph.graph\n        cuGraph graph descriptor, should contain the connectivity information as an\n        adjacency list.\n    start : Integer\n        The index of the graph vertex from which the traversal begins\n    directed : bool\n        Indicates whether the graph in question is a directed graph, or whether\n        each edge has a corresponding reverse edge. (Allows optimizations if the\n        graph is undirected)\n    \n    Returns\n    -------\n    distances, predecessors : cudf.Series\n        distances gives the path distance for each vertex from the starting vertex\n        predecessors gives for each vertex the vertex it was reached from in the traversal\n        \n    Examples\n    --------\n    >>> M = ReadMtxFile(graph_file)\n    >>> sources = cudf.Series(M.row)\n    >>> destinations = cudf.Series(M.col)\n    >>> G = cuGraph.Graph()\n    >>> G.add_edge_list(sources,destinations,none)\n    >>> dist, pred = cuGraph.bfs(G, 0, false)\n    "; /* NOTE(review): the example spells `none` and `false` in lowercase, which are not the Python constants None/False; fix in the .pyx docstring */
+static PyObject *__pyx_pw_11bfs_wrapper_7bfs(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  PyObject *__pyx_v_G = 0;
+  PyObject *__pyx_v_start = 0;
+  PyObject *__pyx_v_directed = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("bfs (wrapper)", 0);
+  {
+    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_G,&__pyx_n_s_start,&__pyx_n_s_directed,0};
+    PyObject* values[3] = {0,0,0};
+    values[2] = ((PyObject *)Py_True); /* default for `directed` when the caller omits it */
+    if (unlikely(__pyx_kwds)) {
+      Py_ssize_t kw_args;
+      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+      switch (pos_args) { /* first copy whatever was passed positionally (cases fall through from highest index down) */
+        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+        CYTHON_FALLTHROUGH;
+        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+        CYTHON_FALLTHROUGH;
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = PyDict_Size(__pyx_kwds); /* number of keyword arguments still to be matched */
+      switch (pos_args) { /* then fill the remaining slots from keywords, starting at the first slot not given positionally */
+        case  0:
+        if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_G)) != 0)) kw_args--;
+        else goto __pyx_L5_argtuple_error;
+        CYTHON_FALLTHROUGH;
+        case  1:
+        if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_start)) != 0)) kw_args--;
+        else {
+          __Pyx_RaiseArgtupleInvalid("bfs", 0, 2, 3, 1); __PYX_ERR(0, 152, __pyx_L3_error)
+        }
+        CYTHON_FALLTHROUGH;
+        case  2:
+        if (kw_args > 0) {
+          PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_directed);
+          if (value) { values[2] = value; kw_args--; }
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* unmatched keywords remain: hand them to __Pyx_ParseOptionalKeywords; a negative result is treated as an error */
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "bfs") < 0)) __PYX_ERR(0, 152, __pyx_L3_error)
+      }
+    } else { /* no keyword dict: only 2 or 3 positional arguments are accepted */
+      switch (PyTuple_GET_SIZE(__pyx_args)) {
+        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+        CYTHON_FALLTHROUGH;
+        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+        values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+        break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+    }
+    __pyx_v_G = values[0];
+    __pyx_v_start = values[1];
+    __pyx_v_directed = values[2];
+  }
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L5_argtuple_error:; /* wrong positional-argument count: raise via __Pyx_RaiseArgtupleInvalid with the 2..3 bounds */
+  __Pyx_RaiseArgtupleInvalid("bfs", 0, 2, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 152, __pyx_L3_error)
+  __pyx_L3_error:;
+  __Pyx_AddTraceback("bfs_wrapper.bfs", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  __pyx_r = __pyx_pf_11bfs_wrapper_6bfs(__pyx_self, __pyx_v_G, __pyx_v_start, __pyx_v_directed);
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_11bfs_wrapper_6bfs(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_G, PyObject *__pyx_v_start, PyObject *__pyx_v_directed) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  struct __pyx_opt_args_11bfs_wrapper_bfs __pyx_t_2;
+  __Pyx_RefNannySetupContext("bfs", 0); /* Python-visible bfs: packs `directed` into the optional-args struct and delegates to __pyx_f_11bfs_wrapper_bfs */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_2.__pyx_n = 1; /* one optional argument (directed) is always supplied on this path */
+  __pyx_t_2.directed = __pyx_v_directed;
+  __pyx_t_1 = __pyx_f_11bfs_wrapper_bfs(__pyx_v_G, __pyx_v_start, 0, &__pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 152, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* ownership of the result moves from the temporary to the return slot */
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* function exit code */
+  __pyx_L1_error:; /* error path: release the temporary, add a traceback entry, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("bfs_wrapper.bfs", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyMethodDef __pyx_methods[] = { /* module-level function table; used as m_methods in __pyx_moduledef */
+  {"bfs", (PyCFunction)__pyx_pw_11bfs_wrapper_7bfs, METH_VARARGS|METH_KEYWORDS, __pyx_doc_11bfs_wrapper_6bfs}, /* bfs accepts positional and keyword arguments */
+  {0, 0, 0, 0} /* all-zero sentinel terminating the table */
+};
+
+#if PY_MAJOR_VERSION >= 3 /* the PyModuleDef machinery below is only compiled for Python 3 */
+#if CYTHON_PEP489_MULTI_PHASE_INIT
+static PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def); /*proto*/
+static int __pyx_pymod_exec_bfs_wrapper(PyObject* module); /*proto*/
+static PyModuleDef_Slot __pyx_moduledef_slots[] = { /* multi-phase init: separate create and exec callbacks */
+  {Py_mod_create, (void*)__pyx_pymod_create},
+  {Py_mod_exec, (void*)__pyx_pymod_exec_bfs_wrapper},
+  {0, NULL} /* sentinel */
+};
+#endif
+
+static struct PyModuleDef __pyx_moduledef = { /* definition of the "bfs_wrapper" extension module; the m_size and slots/reload fields depend on CYTHON_PEP489_MULTI_PHASE_INIT */
+    PyModuleDef_HEAD_INIT,
+    "bfs_wrapper",
+    0, /* m_doc */
+  #if CYTHON_PEP489_MULTI_PHASE_INIT
+    0, /* m_size */
+  #else
+    -1, /* m_size */
+  #endif
+    __pyx_methods /* m_methods */,
+  #if CYTHON_PEP489_MULTI_PHASE_INIT
+    __pyx_moduledef_slots, /* m_slots */
+  #else
+    NULL, /* m_reload */
+  #endif
+    NULL, /* m_traverse */
+    NULL, /* m_clear */
+    NULL /* m_free */
+};
+#endif
+
+static __Pyx_StringTabEntry __pyx_string_tab[] = { /* one entry per string constant: target PyObject* slot, C string, its sizeof, then four flag fields (presumably encoding / is_unicode / is_str / intern per Cython's __Pyx_StringTabEntry -- confirm against its definition) */
+  {&__pyx_kp_u_Find_the_distances_and_predeces, __pyx_k_Find_the_distances_and_predeces, sizeof(__pyx_k_Find_the_distances_and_predeces), 0, 1, 0, 0},
+  {&__pyx_n_s_G, __pyx_k_G, sizeof(__pyx_k_G), 0, 0, 1, 1},
+  {&__pyx_n_s_Graph, __pyx_k_Graph, sizeof(__pyx_k_Graph), 0, 0, 1, 1},
+  {&__pyx_n_s_Graph___init, __pyx_k_Graph___init, sizeof(__pyx_k_Graph___init), 0, 0, 1, 1},
+  {&__pyx_kp_u_Graph___init___line_48, __pyx_k_Graph___init___line_48, sizeof(__pyx_k_Graph___init___line_48), 0, 1, 0, 0},
+  {&__pyx_n_s_Graph_add_adj_list, __pyx_k_Graph_add_adj_list, sizeof(__pyx_k_Graph_add_adj_list), 0, 0, 1, 1},
+  {&__pyx_n_s_Graph_add_edge_list, __pyx_k_Graph_add_edge_list, sizeof(__pyx_k_Graph_add_edge_list), 0, 0, 1, 1},
+  {&__pyx_kp_u_Graph_add_edge_list_line_66, __pyx_k_Graph_add_edge_list_line_66, sizeof(__pyx_k_Graph_add_edge_list_line_66), 0, 1, 0, 0},
+  {&__pyx_n_s_Graph_add_transpose, __pyx_k_Graph_add_transpose, sizeof(__pyx_k_Graph_add_transpose), 0, 0, 1, 1},
+  {&__pyx_n_s_Graph_view_edge_list, __pyx_k_Graph_view_edge_list, sizeof(__pyx_k_Graph_view_edge_list), 0, 0, 1, 1},
+  {&__pyx_kp_u_Returns_Graph_cuGraph_Graph_Exa, __pyx_k_Returns_Graph_cuGraph_Graph_Exa, sizeof(__pyx_k_Returns_Graph_cuGraph_Graph_Exa), 0, 1, 0, 0},
+  {&__pyx_n_s_Series, __pyx_k_Series, sizeof(__pyx_k_Series), 0, 0, 1, 1},
+  {&__pyx_kp_u_Warp_existing_gdf_columns_repre, __pyx_k_Warp_existing_gdf_columns_repre, sizeof(__pyx_k_Warp_existing_gdf_columns_repre), 0, 1, 0, 0},
+  {&__pyx_n_s_add_adj_list, __pyx_k_add_adj_list, sizeof(__pyx_k_add_adj_list), 0, 0, 1, 1},
+  {&__pyx_n_s_add_edge_list, __pyx_k_add_edge_list, sizeof(__pyx_k_add_edge_list), 0, 0, 1, 1},
+  {&__pyx_n_s_add_transpose, __pyx_k_add_transpose, sizeof(__pyx_k_add_transpose), 0, 0, 1, 1},
+  {&__pyx_kp_u_bfs_line_152, __pyx_k_bfs_line_152, sizeof(__pyx_k_bfs_line_152), 0, 1, 0, 0},
+  {&__pyx_n_s_bfs_wrapper, __pyx_k_bfs_wrapper, sizeof(__pyx_k_bfs_wrapper), 0, 0, 1, 1},
+  {&__pyx_n_s_cffi_view, __pyx_k_cffi_view, sizeof(__pyx_k_cffi_view), 0, 0, 1, 1},
+  {&__pyx_n_s_cffi_view_to_column_mem, __pyx_k_cffi_view_to_column_mem, sizeof(__pyx_k_cffi_view_to_column_mem), 0, 0, 1, 1},
+  {&__pyx_n_s_cline_in_traceback, __pyx_k_cline_in_traceback, sizeof(__pyx_k_cline_in_traceback), 0, 0, 1, 1},
+  {&__pyx_n_s_column, __pyx_k_column, sizeof(__pyx_k_column), 0, 0, 1, 1},
+  {&__pyx_kp_s_cuGraph_graph_class_containing, __pyx_k_cuGraph_graph_class_containing, sizeof(__pyx_k_cuGraph_graph_class_containing), 0, 0, 1, 0},
+  {&__pyx_n_s_cudf, __pyx_k_cudf, sizeof(__pyx_k_cudf), 0, 0, 1, 1},
+  {&__pyx_n_s_data, __pyx_k_data, sizeof(__pyx_k_data), 0, 0, 1, 1},
+  {&__pyx_n_s_data_2, __pyx_k_data_2, sizeof(__pyx_k_data_2), 0, 0, 1, 1},
+  {&__pyx_n_s_dest, __pyx_k_dest, sizeof(__pyx_k_dest), 0, 0, 1, 1},
+  {&__pyx_n_s_dest_col, __pyx_k_dest_col, sizeof(__pyx_k_dest_col), 0, 0, 1, 1},
+  {&__pyx_n_s_device_ctypes_pointer, __pyx_k_device_ctypes_pointer, sizeof(__pyx_k_device_ctypes_pointer), 0, 0, 1, 1},
+  {&__pyx_n_s_directed, __pyx_k_directed, sizeof(__pyx_k_directed), 0, 0, 1, 1},
+  {&__pyx_n_s_doc, __pyx_k_doc, sizeof(__pyx_k_doc), 0, 0, 1, 1},
+  {&__pyx_n_s_dtype, __pyx_k_dtype, sizeof(__pyx_k_dtype), 0, 0, 1, 1},
+  {&__pyx_n_s_dtypes, __pyx_k_dtypes, sizeof(__pyx_k_dtypes), 0, 0, 1, 1},
+  {&__pyx_n_s_end, __pyx_k_end, sizeof(__pyx_k_end), 0, 0, 1, 1},
+  {&__pyx_n_s_file, __pyx_k_file, sizeof(__pyx_k_file), 0, 0, 1, 1},
+  {&__pyx_n_s_float32, __pyx_k_float32, sizeof(__pyx_k_float32), 0, 0, 1, 1},
+  {&__pyx_n_s_float64, __pyx_k_float64, sizeof(__pyx_k_float64), 0, 0, 1, 1},
+  {&__pyx_n_s_g, __pyx_k_g, sizeof(__pyx_k_g), 0, 0, 1, 1},
+  {&__pyx_n_s_gdf, __pyx_k_gdf, sizeof(__pyx_k_gdf), 0, 0, 1, 1},
+  {&__pyx_n_s_get_column_data_ptr, __pyx_k_get_column_data_ptr, sizeof(__pyx_k_get_column_data_ptr), 0, 0, 1, 1},
+  {&__pyx_n_s_get_column_valid_ptr, __pyx_k_get_column_valid_ptr, sizeof(__pyx_k_get_column_valid_ptr), 0, 0, 1, 1},
+  {&__pyx_n_s_get_ctype_ptr, __pyx_k_get_ctype_ptr, sizeof(__pyx_k_get_ctype_ptr), 0, 0, 1, 1},
+  {&__pyx_n_s_graph, __pyx_k_gra​ph, sizeof(__pyx_k_graph), 0, 0, 1, 1},
+  {&__pyx_n_s_graph_ptr, __pyx_k_graph_ptr, sizeof(__pyx_k_graph_ptr), 0, 0, 1, 1},
+  {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1},
+  {&__pyx_n_s_indices, __pyx_k_indices, sizeof(__pyx_k_indices), 0, 0, 1, 1},
+  {&__pyx_n_s_indices_col, __pyx_k_indices_col, sizeof(__pyx_k_indices_col), 0, 0, 1, 1},
+  {&__pyx_n_s_init, __pyx_k_init, sizeof(__pyx_k_init), 0, 0, 1, 1},
+  {&__pyx_n_s_int32, __pyx_k_int32, sizeof(__pyx_k_int32), 0, 0, 1, 1},
+  {&__pyx_n_s_int64, __pyx_k_int64, sizeof(__pyx_k_int64), 0, 0, 1, 1},
+  {&__pyx_n_s_librmm, __pyx_k_librmm, sizeof(__pyx_k_librmm), 0, 0, 1, 1},
+  {&__pyx_n_s_librmm_cffi, __pyx_k_librmm_cffi, sizeof(__pyx_k_librmm_cffi), 0, 0, 1, 1},
+  {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1},
+  {&__pyx_n_s_mask, __pyx_k_mask, sizeof(__pyx_k_mask), 0, 0, 1, 1},
+  {&__pyx_n_s_metaclass, __pyx_k_metaclass, sizeof(__pyx_k_metaclass), 0, 0, 1, 1},
+  {&__pyx_n_s_module, __pyx_k_module, sizeof(__pyx_k_module), 0, 0, 1, 1},
+  {&__pyx_n_s_np, __pyx_k_np, sizeof(__pyx_k_np), 0, 0, 1, 1},
+  {&__pyx_n_s_null_count, __pyx_k_null_count, sizeof(__pyx_k_null_count), 0, 0, 1, 1},
+  {&__pyx_n_s_numpy, __pyx_k_numpy, sizeof(__pyx_k_numpy), 0, 0, 1, 1},
+  {&__pyx_n_s_obj, __pyx_k_obj, sizeof(__pyx_k_obj), 0, 0, 1, 1},
+  {&__pyx_n_s_offsets, __pyx_k_offsets, sizeof(__pyx_k_offsets), 0, 0, 1, 1},
+  {&__pyx_n_s_offsets_col, __pyx_k_offsets_col, sizeof(__pyx_k_offsets_col), 0, 0, 1, 1},
+  {&__pyx_n_s_prepare, __pyx_k_prepare, sizeof(__pyx_k_prepare), 0, 0, 1, 1},
+  {&__pyx_n_s_print, __pyx_k_print, sizeof(__pyx_k_print), 0, 0, 1, 1},
+  {&__pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_k_python_bfs_bfs_wrapper_pyx, sizeof(__pyx_k_python_bfs_bfs_wrapper_pyx), 0, 0, 1, 0},
+  {&__pyx_n_s_qualname, __pyx_k_qualname, sizeof(__pyx_k_qualname), 0, 0, 1, 1},
+  {&__pyx_n_s_rmm, __pyx_k_rmm, sizeof(__pyx_k_rmm), 0, 0, 1, 1},
+  {&__pyx_n_s_self, __pyx_k_self, sizeof(__pyx_k_self), 0, 0, 1, 1},
+  {&__pyx_n_s_size, __pyx_k_size, sizeof(__pyx_k_size), 0, 0, 1, 1},
+  {&__pyx_n_s_source, __pyx_k_source, sizeof(__pyx_k_source), 0, 0, 1, 1},
+  {&__pyx_n_s_source_col, __pyx_k_source_col, sizeof(__pyx_k_source_col), 0, 0, 1, 1},
+  {&__pyx_n_s_start, __pyx_k_start, sizeof(__pyx_k_start), 0, 0, 1, 1},
+  {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1},
+  {&__pyx_n_s_to_gpu_array, __pyx_k_to_gpu_array, sizeof(__pyx_k_to_gpu_array), 0, 0, 1, 1},
+  {&__pyx_n_s_type, __pyx_k_type, sizeof(__pyx_k_type), 0, 0, 1, 1},
+  {&__pyx_n_s_value, __pyx_k_value, sizeof(__pyx_k_value), 0, 0, 1, 1},
+  {&__pyx_n_s_value_col, __pyx_k_value_col, sizeof(__pyx_k_value_col), 0, 0, 1, 1},
+  {&__pyx_n_s_view_edge_list, __pyx_k_view_edge_list, sizeof(__pyx_k_view_edge_list), 0, 0, 1, 1},
+  {&__pyx_n_s_zeros, __pyx_k_zeros, sizeof(__pyx_k_zeros), 0, 0, 1, 1},
+  {0, 0, 0, 0, 0, 0, 0} /* all-zero sentinel terminating the table */
+};
+static int __Pyx_InitCachedBuiltins(void) { /* performs no work in this module and unconditionally returns 0 */
+  return 0;
+}
+
+static int __Pyx_InitCachedConstants(void) { /* Creates the module's constant tuples and code objects; returns 0 on success, -1 as soon as one creation fails. */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0);
+
+  /* "bfs_wrapper.pyx":12
+ * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}
+ * 
+ * def _get_ctype_ptr(obj):             # <<<<<<<<<<<<<<
+ *     # The manner to access the pointers in the gdf's might change, so
+ *     # encapsulating access in the following 3 methods. They might also be
+ */
+  __pyx_tuple_ = PyTuple_Pack(1, __pyx_n_s_obj); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 12, __pyx_L1_error) /* presumably the variable-name tuple for the code object built on the next line - TODO confirm against __Pyx_PyCode_New */
+  __Pyx_GOTREF(__pyx_tuple_);
+  __Pyx_GIVEREF(__pyx_tuple_);
+  __pyx_codeobj__2 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple_, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_get_ctype_ptr, 12, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__2)) __PYX_ERR(0, 12, __pyx_L1_error)
+
+  /* "bfs_wrapper.pyx":18
+ *     return obj.device_ctypes_pointer.value
+ * 
+ * def _get_column_data_ptr(obj):             # <<<<<<<<<<<<<<
+ *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
+ * 
+ */
+  __pyx_tuple__3 = PyTuple_Pack(1, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 18, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__3);
+  __Pyx_GIVEREF(__pyx_tuple__3);
+  __pyx_codeobj__4 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__3, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_get_column_data_ptr, 18, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__4)) __PYX_ERR(0, 18, __pyx_L1_error)
+
+  /* "bfs_wrapper.pyx":21
+ *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
+ * 
+ * def _get_column_valid_ptr(obj):             # <<<<<<<<<<<<<<
+ *     return _get_ctype_ptr(obj._column._mask.to_gpu_array())
+ * 
+ */
+  __pyx_tuple__5 = PyTuple_Pack(1, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(0, 21, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__5);
+  __Pyx_GIVEREF(__pyx_tuple__5);
+  __pyx_codeobj__6 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__5, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_get_column_valid_ptr, 21, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__6)) __PYX_ERR(0, 21, __pyx_L1_error)
+
+  /* "bfs_wrapper.pyx":48
+ *         cuGraph graph class containing basic graph creation and transformation operations.
+ *     """
+ *     def __init__(self):             # <<<<<<<<<<<<<<
+ *         """
+ *         Returns
+ */
+  __pyx_tuple__7 = PyTuple_Pack(3, __pyx_n_s_self, __pyx_n_s_graph, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(0, 48, __pyx_L1_error) /* three names are packed although the quoted signature takes only self */
+  __Pyx_GOTREF(__pyx_tuple__7);
+  __Pyx_GIVEREF(__pyx_tuple__7);
+  __pyx_codeobj__8 = (PyObject*)__Pyx_PyCode_New(1, 0, 3, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__7, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_init, 48, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__8)) __PYX_ERR(0, 48, __pyx_L1_error)
+
+  /* "bfs_wrapper.pyx":66
+ * 
+ * 
+ *     def add_edge_list(self, source_col, dest_col, value_col=None):             # <<<<<<<<<<<<<<
+ *         """
+ *         Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory.
+ */
+  __pyx_tuple__9 = PyTuple_Pack(8, __pyx_n_s_self, __pyx_n_s_source_col, __pyx_n_s_dest_col, __pyx_n_s_value_col, __pyx_n_s_graph, __pyx_n_s_source, __pyx_n_s_dest, __pyx_n_s_value); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(0, 66, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__9);
+  __Pyx_GIVEREF(__pyx_tuple__9);
+  __pyx_codeobj__10 = (PyObject*)__Pyx_PyCode_New(4, 0, 8, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__9, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_add_edge_list, 66, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__10)) __PYX_ERR(0, 66, __pyx_L1_error)
+  __pyx_tuple__11 = PyTuple_Pack(1, ((PyObject *)Py_None)); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(0, 66, __pyx_L1_error) /* one-element tuple (None,); the module-init code installs it as add_edge_list's defaults tuple */
+  __Pyx_GOTREF(__pyx_tuple__11);
+  __Pyx_GIVEREF(__pyx_tuple__11);
+
+  /* "bfs_wrapper.pyx":111
+ *                        <gdf_column*>value)
+ * 
+ *     def view_edge_list(self):             # <<<<<<<<<<<<<<
+ *         ##TO DO
+ *         """
+ */
+  __pyx_tuple__12 = PyTuple_Pack(6, __pyx_n_s_self, __pyx_n_s_graph, __pyx_n_s_g, __pyx_n_s_size, __pyx_n_s_cffi_view, __pyx_n_s_data_2); if (unlikely(!__pyx_tuple__12)) __PYX_ERR(0, 111, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__12);
+  __Pyx_GIVEREF(__pyx_tuple__12);
+  __pyx_codeobj__13 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__12, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_view_edge_list, 111, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__13)) __PYX_ERR(0, 111, __pyx_L1_error)
+
+  /* "bfs_wrapper.pyx":125
+ *         return 0
+ * 
+ *     def add_adj_list(self, offsets_col, indices_col, value_col):             # <<<<<<<<<<<<<<
+ *         """
+ *         Warp existing gdf columns representing an adjacency list in a gdf_graph.
+ */
+  __pyx_tuple__14 = PyTuple_Pack(8, __pyx_n_s_self, __pyx_n_s_offsets_col, __pyx_n_s_indices_col, __pyx_n_s_value_col, __pyx_n_s_graph, __pyx_n_s_offsets, __pyx_n_s_indices, __pyx_n_s_value); if (unlikely(!__pyx_tuple__14)) __PYX_ERR(0, 125, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__14);
+  __Pyx_GIVEREF(__pyx_tuple__14);
+  __pyx_codeobj__15 = (PyObject*)__Pyx_PyCode_New(4, 0, 8, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__14, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_add_adj_list, 125, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__15)) __PYX_ERR(0, 125, __pyx_L1_error)
+
+  /* "bfs_wrapper.pyx":145
+ * 
+ * 
+ *     def add_transpose(self):             # <<<<<<<<<<<<<<
+ *         """
+ *         Compute the transposed adjacency list from the edge list and add it to the existing graph.
+ */
+  __pyx_tuple__16 = PyTuple_Pack(2, __pyx_n_s_self, __pyx_n_s_graph); if (unlikely(!__pyx_tuple__16)) __PYX_ERR(0, 145, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_tuple__16);
+  __Pyx_GIVEREF(__pyx_tuple__16);
+  __pyx_codeobj__17 = (PyObject*)__Pyx_PyCode_New(1, 0, 2, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__16, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_add_transpose, 145, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__17)) __PYX_ERR(0, 145, __pyx_L1_error)
+  __Pyx_RefNannyFinishContext();
+  return 0;
+  __pyx_L1_error:; /* label handed to every __PYX_ERR above; objects created before the failure are not released here */
+  __Pyx_RefNannyFinishContext();
+  return -1;
+}
+
+static int __Pyx_InitGlobals(void) { /* Runs __Pyx_InitStrings over __pyx_string_tab and creates the cached integer object 0; returns 0 on success, -1 on failure. */
+  if (__Pyx_InitStrings(__pyx_string_tab) < 0) __PYX_ERR(0, 1, __pyx_L1_error);
+  __pyx_int_0 = PyInt_FromLong(0); if (unlikely(!__pyx_int_0)) __PYX_ERR(0, 1, __pyx_L1_error)
+  return 0;
+  __pyx_L1_error:; /* label handed to the __PYX_ERR uses above */
+  return -1;
+}
+
+static int __Pyx_modinit_global_init_code(void); /*proto*/
+static int __Pyx_modinit_variable_export_code(void); /*proto*/
+static int __Pyx_modinit_function_export_code(void); /*proto*/
+static int __Pyx_modinit_type_init_code(void); /*proto*/
+static int __Pyx_modinit_type_import_code(void); /*proto*/
+static int __Pyx_modinit_variable_import_code(void); /*proto*/
+static int __Pyx_modinit_function_import_code(void); /*proto*/
+
+static int __Pyx_modinit_global_init_code(void) { /* Global-init stage of module setup; no code was generated for it here, so it only opens and closes its RefNanny context and returns 0. */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_global_init_code", 0);
+  /*--- Global init code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_variable_export_code(void) { /* Variable-export stage of module setup; empty in this module, so it only opens and closes its RefNanny context and returns 0. */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_variable_export_code", 0);
+  /*--- Variable export code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_function_export_code(void) { /* Function-export stage of module setup; empty in this module, so it only opens and closes its RefNanny context and returns 0. */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_function_export_code", 0);
+  /*--- Function export code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_type_init_code(void) { /* Type-init stage of module setup; empty in this module, so it only opens and closes its RefNanny context and returns 0. */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0);
+  /*--- Type init code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_type_import_code(void) { /* Type-import stage of module setup; empty in this module, so it only opens and closes its RefNanny context and returns 0. */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_type_import_code", 0);
+  /*--- Type import code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_variable_import_code(void) { /* Variable-import stage of module setup; empty in this module, so it only opens and closes its RefNanny context and returns 0. */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_variable_import_code", 0);
+  /*--- Variable import code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+static int __Pyx_modinit_function_import_code(void) { /* Function-import stage of module setup; empty in this module, so it only opens and closes its RefNanny context and returns 0. */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_modinit_function_import_code", 0);
+  /*--- Function import code ---*/
+  __Pyx_RefNannyFinishContext();
+  return 0;
+}
+
+
+#if PY_MAJOR_VERSION < 3 /* selects the declaration prefix used for the module entry point */
+#ifdef CYTHON_NO_PYINIT_EXPORT
+#define __Pyx_PyMODINIT_FUNC void
+#else
+#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC
+#endif /* CYTHON_NO_PYINIT_EXPORT */
+#else /* PY_MAJOR_VERSION >= 3 */
+#ifdef CYTHON_NO_PYINIT_EXPORT
+#define __Pyx_PyMODINIT_FUNC PyObject *
+#else
+#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC
+#endif /* CYTHON_NO_PYINIT_EXPORT */
+#endif /* PY_MAJOR_VERSION < 3 */
+#ifndef CYTHON_SMALL_CODE /* a definition supplied from outside takes precedence */
+#if defined(__clang__)
+    #define CYTHON_SMALL_CODE
+#elif defined(__GNUC__) && (!(defined(__cplusplus)) || (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 4))) /* GCC compiling C, or GCC newer than 4.4 compiling C++ */
+    #define CYTHON_SMALL_CODE __attribute__((cold))
+#else
+    #define CYTHON_SMALL_CODE
+#endif
+#endif /* CYTHON_SMALL_CODE */
+
+
+#if PY_MAJOR_VERSION < 3
+__Pyx_PyMODINIT_FUNC initbfs_wrapper(void) CYTHON_SMALL_CODE; /*proto*/
+__Pyx_PyMODINIT_FUNC initbfs_wrapper(void) /* Python 2 entry point; its body is the brace block that follows the two closing #endif lines further down */
+#else
+__Pyx_PyMODINIT_FUNC PyInit_bfs_wrapper(void) CYTHON_SMALL_CODE; /*proto*/
+__Pyx_PyMODINIT_FUNC PyInit_bfs_wrapper(void) /* Python 3 entry point */
+#if CYTHON_PEP489_MULTI_PHASE_INIT
+{ /* multi-phase build: the entry point only passes the module definition to PyModuleDef_Init */
+  return PyModuleDef_Init(&__pyx_moduledef);
+}
+static int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *moddict, const char* from_name, const char* to_name) { /* Copies attribute from_name of spec into moddict under key to_name. Returns the PyDict_SetItemString result, 0 when the attribute does not exist, -1 on any other lookup error. */
+    PyObject *value = PyObject_GetAttrString(spec, from_name);
+    int result = 0;
+    if (likely(value)) {
+        result = PyDict_SetItemString(moddict, to_name, value);
+        Py_DECREF(value); /* release the reference obtained from the attribute lookup */
+    } else if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
+        PyErr_Clear(); /* missing attribute is tolerated: clear the error and keep result at 0 */
+    } else {
+        result = -1; /* the pending exception is left set for the caller */
+    }
+    return result;
+}
+static PyObject* __pyx_pymod_create(PyObject *spec, CYTHON_UNUSED PyModuleDef *def) { /* Module-creation step of the multi-phase build (presumably registered as the create slot of __pyx_moduledef - not visible here). Returns the module object, or NULL on failure. */
+    PyObject *module = NULL, *moddict, *modname;
+    if (__pyx_m)
+        return __Pyx_NewRef(__pyx_m); /* a module was already created: hand that one back instead of building another */
+    modname = PyObject_GetAttrString(spec, "name");
+    if (unlikely(!modname)) goto bad;
+    module = PyModule_NewObject(modname);
+    Py_DECREF(modname); /* the name object is not used again after the module has been created */
+    if (unlikely(!module)) goto bad;
+    moddict = PyModule_GetDict(module);
+    if (unlikely(!moddict)) goto bad;
+    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "loader", "__loader__") < 0)) goto bad; /* the four calls below mirror spec attributes into the module dict */
+    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "origin", "__file__") < 0)) goto bad;
+    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "parent", "__package__") < 0)) goto bad;
+    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "submodule_search_locations", "__path__") < 0)) goto bad;
+    return module;
+bad:
+    Py_XDECREF(module); /* module is still NULL on the earliest failure paths, hence XDECREF */
+    return NULL;
+}
+
+
+static int __pyx_pymod_exec_bfs_wrapper(PyObject *__pyx_pyinit_module) /* multi-phase build: module-exec function; in the other builds the body below belongs to initbfs_wrapper / PyInit_bfs_wrapper declared above */
+#endif
+#endif
+{
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  __Pyx_RefNannyDeclarations
+  #if CYTHON_PEP489_MULTI_PHASE_INIT
+  if (__pyx_m && __pyx_m == __pyx_pyinit_module) return 0; /* this exact module object was already initialised: nothing to do */
+  #elif PY_MAJOR_VERSION >= 3
+  if (__pyx_m) return __Pyx_NewRef(__pyx_m); /* repeated init call: return the module created earlier */
+  #endif
+  #if CYTHON_REFNANNY
+__Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny");
+if (!__Pyx_RefNanny) {
+  PyErr_Clear();
+  __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny"); /* second attempt under the package-qualified name */
+  if (!__Pyx_RefNanny)
+      Py_FatalError("failed to import 'refnanny' module");
+}
+#endif
+  __Pyx_RefNannySetupContext("__Pyx_PyMODINIT_FUNC PyInit_bfs_wrapper(void)", 0);
+  if (__Pyx_check_binary_version() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error) /* shared empty tuple / bytes / unicode objects are created first */
+  __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_empty_unicode = PyUnicode_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_unicode)) __PYX_ERR(0, 1, __pyx_L1_error)
+  #ifdef __Pyx_CyFunction_USED
+  if (__pyx_CyFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_FusedFunction_USED
+  if (__pyx_FusedFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_Coroutine_USED
+  if (__pyx_Coroutine_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_Generator_USED
+  if (__pyx_Generator_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_AsyncGen_USED
+  if (__pyx_AsyncGen_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  #ifdef __Pyx_StopAsyncIteration_USED
+  if (__pyx_StopAsyncIteration_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  /*--- Library function declarations ---*/
+  /*--- Threads initialization code ---*/
+  #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS
+  #ifdef WITH_THREAD /* Python build with threading support? */
+  PyEval_InitThreads();
+  #endif
+  #endif
+  /*--- Module creation code ---*/
+  #if CYTHON_PEP489_MULTI_PHASE_INIT
+  __pyx_m = __pyx_pyinit_module; /* multi-phase build: adopt the module object passed in by the caller */
+  Py_INCREF(__pyx_m);
+  #else
+  #if PY_MAJOR_VERSION < 3
+  __pyx_m = Py_InitModule4("bfs_wrapper", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m);
+  #else
+  __pyx_m = PyModule_Create(&__pyx_moduledef);
+  #endif
+  if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) __PYX_ERR(0, 1, __pyx_L1_error) /* __pyx_d is the module dict that all module-level names below are stored into */
+  Py_INCREF(__pyx_d);
+  __pyx_b = PyImport_AddModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) __PYX_ERR(0, 1, __pyx_L1_error)
+  __pyx_cython_runtime = PyImport_AddModule((char *) "cython_runtime"); if (unlikely(!__pyx_cython_runtime)) __PYX_ERR(0, 1, __pyx_L1_error)
+  #if CYTHON_COMPILING_IN_PYPY
+  Py_INCREF(__pyx_b);
+  #endif
+  if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) __PYX_ERR(0, 1, __pyx_L1_error);
+  /*--- Initialize various global constants etc. ---*/
+  if (__Pyx_InitGlobals() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT)
+  if (__Pyx_init_sys_getdefaultencoding_params() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+  if (__pyx_module_is_main_bfs_wrapper) {
+    if (PyObject_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  }
+  #if PY_MAJOR_VERSION >= 3
+  {
+    PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) __PYX_ERR(0, 1, __pyx_L1_error)
+    if (!PyDict_GetItemString(modules, "bfs_wrapper")) { /* register the module in the import system's module dict only if no entry with this name exists yet */
+      if (unlikely(PyDict_SetItemString(modules, "bfs_wrapper", __pyx_m) < 0)) __PYX_ERR(0, 1, __pyx_L1_error)
+    }
+  }
+  #endif
+  /*--- Builtin init code ---*/
+  if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  /*--- Constants init code ---*/
+  if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  /*--- Global type/function init code ---*/
+  (void)__Pyx_modinit_global_init_code(); /* results of these seven stage calls are discarded; the stage functions defined in this file always return 0 */
+  (void)__Pyx_modinit_variable_export_code();
+  (void)__Pyx_modinit_function_export_code();
+  (void)__Pyx_modinit_type_init_code();
+  (void)__Pyx_modinit_type_import_code();
+  (void)__Pyx_modinit_variable_import_code();
+  (void)__Pyx_modinit_function_import_code();
+  /*--- Execution code ---*/
+  #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED)
+  if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  #endif
+
+  /* "bfs_wrapper.pyx":5
+ * from libc.stdint cimport uintptr_t
+ * from libc.stdlib cimport calloc, malloc, free
+ * import cudf             # <<<<<<<<<<<<<<
+ * from librmm_cffi import librmm as rmm
+ * #from pygdf import Column
+ */
+  __pyx_t_1 = __Pyx_Import(__pyx_n_s_cudf, 0, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_cudf, __pyx_t_1) < 0) __PYX_ERR(0, 5, __pyx_L1_error) /* bind the imported module as a module-level name */
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "bfs_wrapper.pyx":6
+ * from libc.stdlib cimport calloc, malloc, free
+ * import cudf
+ * from librmm_cffi import librmm as rmm             # <<<<<<<<<<<<<<
+ * #from pygdf import Column
+ * import numpy as np
+ */
+  __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error) /* one-element list holding the name to import, passed to __Pyx_Import below */
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_INCREF(__pyx_n_s_librmm);
+  __Pyx_GIVEREF(__pyx_n_s_librmm);
+  PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_librmm);
+  __pyx_t_2 = __Pyx_Import(__pyx_n_s_librmm_cffi, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 6, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_librmm); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_rmm, __pyx_t_1) < 0) __PYX_ERR(0, 6, __pyx_L1_error) /* stored under the alias name, not the imported name */
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "bfs_wrapper.pyx":8
+ * from librmm_cffi import librmm as rmm
+ * #from pygdf import Column
+ * import numpy as np             # <<<<<<<<<<<<<<
+ * 
+ * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}
+ */
+  __pyx_t_2 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 8, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_2) < 0) __PYX_ERR(0, 8, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "bfs_wrapper.pyx":10
+ * import numpy as np
+ * 
+ * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}             # <<<<<<<<<<<<<<
+ * 
+ * def _get_ctype_ptr(obj):
+ */
+  __pyx_t_2 = __Pyx_PyDict_NewPresized(4); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error) /* the dict literal; the four key/value pairs are inserted one at a time below */
+  __Pyx_GOTREF(__pyx_t_2);
+  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_int32); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error) /* key: attribute of np */
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyInt_From_gdf_dtype(GDF_INT32); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error) /* value: the C enum constant converted to a Python object */
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_t_2, __pyx_t_3, __pyx_t_1) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_int64); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyInt_From_gdf_dtype(GDF_INT64); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_t_2, __pyx_t_3, __pyx_t_1) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_float32); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyInt_From_gdf_dtype(GDF_FLOAT32); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_t_2, __pyx_t_3, __pyx_t_1) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_float64); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_3);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyInt_From_gdf_dtype(GDF_FLOAT64); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_t_2, __pyx_t_3, __pyx_t_1) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_dtypes, __pyx_t_2) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "bfs_wrapper.pyx":12
+ * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}
+ * 
+ * def _get_ctype_ptr(obj):             # <<<<<<<<<<<<<<
+ *     # The manner to access the pointers in the gdf's might change, so
+ *     # encapsulating access in the following 3 methods. They might also be
+ */
+  __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_11bfs_wrapper_1_get_ctype_ptr, NULL, __pyx_n_s_bfs_wrapper); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 12, __pyx_L1_error) /* the three module-level helpers are wrapped with PyCFunction_NewEx, unlike the Graph methods further down */
+  __Pyx_GOTREF(__pyx_t_2);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_get_ctype_ptr, __pyx_t_2) < 0) __PYX_ERR(0, 12, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "bfs_wrapper.pyx":18
+ *     return obj.device_ctypes_pointer.value
+ * 
+ * def _get_column_data_ptr(obj):             # <<<<<<<<<<<<<<
+ *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
+ * 
+ */
+  __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_11bfs_wrapper_3_get_column_data_ptr, NULL, __pyx_n_s_bfs_wrapper); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 18, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_get_column_data_ptr, __pyx_t_2) < 0) __PYX_ERR(0, 18, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "bfs_wrapper.pyx":21
+ *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
+ * 
+ * def _get_column_valid_ptr(obj):             # <<<<<<<<<<<<<<
+ *     return _get_ctype_ptr(obj._column._mask.to_gpu_array())
+ * 
+ */
+  __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5_get_column_valid_ptr, NULL, __pyx_n_s_bfs_wrapper); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 21, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_2);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_get_column_valid_ptr, __pyx_t_2) < 0) __PYX_ERR(0, 21, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "bfs_wrapper.pyx":44
+ *     return col_ptr
+ * 
+ * class Graph:             # <<<<<<<<<<<<<<
+ *     """
+ *         cuGraph graph class containing basic graph creation and transformation operations.
+ */
+  __pyx_t_2 = __Pyx_Py3MetaclassPrepare((PyObject *) NULL, __pyx_empty_tuple, __pyx_n_s_Graph, __pyx_n_s_Graph, (PyObject *) NULL, __pyx_n_s_bfs_wrapper, __pyx_kp_s_cuGraph_graph_class_containing); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 44, __pyx_L1_error) /* __pyx_t_2 is the class namespace for Graph; it stays live until the class is created below */
+  __Pyx_GOTREF(__pyx_t_2);
+
+  /* "bfs_wrapper.pyx":48
+ *         cuGraph graph class containing basic graph creation and transformation operations.
+ *     """
+ *     def __init__(self):             # <<<<<<<<<<<<<<
+ *         """
+ *         Returns
+ */
+  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5Graph_1__init__, 0, __pyx_n_s_Graph___init, NULL, __pyx_n_s_bfs_wrapper, __pyx_d, ((PyObject *)__pyx_codeobj__8)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 48, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (__Pyx_SetNameInClass(__pyx_t_2, __pyx_n_s_init, __pyx_t_1) < 0) __PYX_ERR(0, 48, __pyx_L1_error) /* each method is stored into the class namespace, not the module dict */
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "bfs_wrapper.pyx":66
+ * 
+ * 
+ *     def add_edge_list(self, source_col, dest_col, value_col=None):             # <<<<<<<<<<<<<<
+ *         """
+ *         Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory.
+ */
+  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5Graph_3add_edge_list, 0, __pyx_n_s_Graph_add_edge_list, NULL, __pyx_n_s_bfs_wrapper, __pyx_d, ((PyObject *)__pyx_codeobj__10)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 66, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_CyFunction_SetDefaultsTuple(__pyx_t_1, __pyx_tuple__11); /* __pyx_tuple__11 is the (None,) tuple built in __Pyx_InitCachedConstants, matching value_col=None */
+  if (__Pyx_SetNameInClass(__pyx_t_2, __pyx_n_s_add_edge_list, __pyx_t_1) < 0) __PYX_ERR(0, 66, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "bfs_wrapper.pyx":111
+ *                        <gdf_column*>value)
+ * 
+ *     def view_edge_list(self):             # <<<<<<<<<<<<<<
+ *         ##TO DO
+ *         """
+ */
+  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5Graph_5view_edge_list, 0, __pyx_n_s_Graph_view_edge_list, NULL, __pyx_n_s_bfs_wrapper, __pyx_d, ((PyObject *)__pyx_codeobj__13)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 111, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (__Pyx_SetNameInClass(__pyx_t_2, __pyx_n_s_view_edge_list, __pyx_t_1) < 0) __PYX_ERR(0, 111, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "bfs_wrapper.pyx":125
+ *         return 0
+ * 
+ *     def add_adj_list(self, offsets_col, indices_col, value_col):             # <<<<<<<<<<<<<<
+ *         """
+ *         Warp existing gdf columns representing an adjacency list in a gdf_graph.
+ */
+  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5Graph_7add_adj_list, 0, __pyx_n_s_Graph_add_adj_list, NULL, __pyx_n_s_bfs_wrapper, __pyx_d, ((PyObject *)__pyx_codeobj__15)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (__Pyx_SetNameInClass(__pyx_t_2, __pyx_n_s_add_adj_list, __pyx_t_1) < 0) __PYX_ERR(0, 125, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "bfs_wrapper.pyx":145
+ * 
+ * 
+ *     def add_transpose(self):             # <<<<<<<<<<<<<<
+ *         """
+ *         Compute the transposed adjacency list from the edge list and add it to the existing graph.
+ */
+  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5Graph_9add_transpose, 0, __pyx_n_s_Graph_add_transpose, NULL, __pyx_n_s_bfs_wrapper, __pyx_d, ((PyObject *)__pyx_codeobj__17)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 145, __pyx_L1_error)
+  __Pyx_GOTREF(__pyx_t_1);
+  if (__Pyx_SetNameInClass(__pyx_t_2, __pyx_n_s_add_transpose, __pyx_t_1) < 0) __PYX_ERR(0, 145, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "bfs_wrapper.pyx":44
+ *     return col_ptr
+ * 
+ * class Graph:             # <<<<<<<<<<<<<<
+ *     """
+ *         cuGraph graph class containing basic graph creation and transformation operations.
+ */
+  __pyx_t_1 = __Pyx_Py3ClassCreate(((PyObject*)&__Pyx_DefaultClassType), __pyx_n_s_Graph, __pyx_empty_tuple, __pyx_t_2, NULL, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 44, __pyx_L1_error) /* build the class object from the populated namespace */
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_Graph, __pyx_t_1) < 0) __PYX_ERR(0, 44, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /* "bfs_wrapper.pyx":1
+ * from c_bfs cimport *             # <<<<<<<<<<<<<<
+ * from libcpp cimport bool
+ * from libc.stdint cimport uintptr_t
+ */
+  __pyx_t_2 = __Pyx_PyDict_NewPresized(3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 1, __pyx_L1_error) /* three-entry dict stored under the name held in __pyx_n_s_test (presumably "__test__" - string table entry not fully visible here) */
+  __Pyx_GOTREF(__pyx_t_2);
+  if (PyDict_SetItem(__pyx_t_2, __pyx_kp_u_Graph___init___line_48, __pyx_kp_u_Returns_Graph_cuGraph_Graph_Exa) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  if (PyDict_SetItem(__pyx_t_2, __pyx_kp_u_Graph_add_edge_list_line_66, __pyx_kp_u_Warp_existing_gdf_columns_repre) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  if (PyDict_SetItem(__pyx_t_2, __pyx_kp_u_bfs_line_152, __pyx_kp_u_Find_the_distances_and_predeces) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_2) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+
+  /*--- Wrapped vars code ---*/
+
+  goto __pyx_L0; /* success: skip the error cleanup */
+  __pyx_L1_error:; /* failure path: release the temporaries, then report the error */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  if (__pyx_m) {
+    if (__pyx_d) {
+      __Pyx_AddTraceback("init bfs_wrapper", 0, __pyx_lineno, __pyx_filename);
+    }
+    Py_DECREF(__pyx_m); __pyx_m = 0; /* clearing __pyx_m is what makes the return statements below signal failure */
+  } else if (!PyErr_Occurred()) { /* failed before a module existed and nothing set an exception: raise ImportError */
+    PyErr_SetString(PyExc_ImportError, "init bfs_wrapper");
+  }
+  __pyx_L0:; /* common exit for both outcomes */
+  __Pyx_RefNannyFinishContext();
+  #if CYTHON_PEP489_MULTI_PHASE_INIT
+  return (__pyx_m != NULL) ? 0 : -1;
+  #elif PY_MAJOR_VERSION >= 3
+  return __pyx_m;
+  #else
+  return;
+  #endif
+}
+
+/* --- Runtime support code --- */
+/* Refnanny */
+#if CYTHON_REFNANNY
+static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { /* Imports module modname and converts its "RefNannyAPI" attribute to a pointer; the result stays NULL if the import or the attribute lookup fails. */
+    PyObject *m = NULL, *p = NULL;
+    void *r = NULL;
+    m = PyImport_ImportModule((char *)modname);
+    if (!m) goto end;
+    p = PyObject_GetAttrString(m, (char *)"RefNannyAPI");
+    if (!p) goto end;
+    r = PyLong_AsVoidPtr(p); /* the attribute is read as an integer and reinterpreted as an address */
+end:
+    Py_XDECREF(p); /* p and m may still be NULL on the early exits, hence XDECREF */
+    Py_XDECREF(m);
+    return (__Pyx_RefNannyAPIStruct *)r;
+}
+#endif
+
+/* PyObjectGetAttrStr */
+#if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) { /* Attribute lookup that calls the type's tp_getattro slot directly when it is set, and otherwise falls back as shown below. */
+    PyTypeObject* tp = Py_TYPE(obj);
+    if (likely(tp->tp_getattro))
+        return tp->tp_getattro(obj, attr_name);
+#if PY_MAJOR_VERSION < 3
+    if (likely(tp->tp_getattr))
+        return tp->tp_getattr(obj, PyString_AS_STRING(attr_name)); /* Python 2 only: the older slot takes the name as a C string */
+#endif
+    return PyObject_GetAttr(obj, attr_name); /* neither slot is set: use the generic API */
+}
+#endif
+
+/* GetBuiltinName */
+static PyObject *__Pyx_GetBuiltinName(PyObject *name) { /* Looks name up as an attribute of __pyx_b (the builtins module obtained during module init); on failure sets NameError and returns NULL. */
+    PyObject* result = __Pyx_PyObject_GetAttrStr(__pyx_b, name);
+    if (unlikely(!result)) {
+        PyErr_Format(PyExc_NameError, /* sets NameError after the failed lookup; the format string depends on the Python major version */
+#if PY_MAJOR_VERSION >= 3
+            "name '%U' is not defined", name);
+#else
+            "name '%.200s' is not defined", PyString_AS_STRING(name));
+#endif
+    }
+    return result;
+}
+
+/* GetModuleGlobalName */
+static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name) { /* Looks name up in the module dict __pyx_d and falls back to __Pyx_GetBuiltinName when it is not found; the preprocessor branches differ only in how the dict lookup is done. */
+    PyObject *result;
+#if !CYTHON_AVOID_BORROWED_REFS
+#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1
+    result = _PyDict_GetItem_KnownHash(__pyx_d, name, ((PyASCIIObject *) name)->hash); /* reuses the hash stored in the string object */
+    if (likely(result)) {
+        Py_INCREF(result); /* the dict lookup result is INCREF'd before being returned */
+    } else if (unlikely(PyErr_Occurred())) { /* the lookup itself raised: return NULL without consulting builtins */
+        result = NULL;
+    } else {
+#else
+    result = PyDict_GetItem(__pyx_d, name);
+    if (likely(result)) {
+        Py_INCREF(result);
+    } else {
+#endif
+#else
+    result = PyObject_GetItem(__pyx_d, name);
+    if (!result) {
+        PyErr_Clear(); /* discard the lookup error before trying builtins */
+#endif
+        result = __Pyx_GetBuiltinName(name); /* shared tail of whichever "not found" branch was compiled in */
+    }
+    return result;
+}
+
+/* PyCFunctionFastCall */
+    #if CYTHON_FAST_PYCCALL
+static CYTHON_INLINE PyObject * __Pyx_PyCFunction_FastCall(PyObject *func_obj, PyObject **args, Py_ssize_t nargs) { /* Invokes the C function stored in func_obj directly with the (args, nargs) array and returns its result. */
+    PyCFunctionObject *func = (PyCFunctionObject*)func_obj;
+    PyCFunction meth = PyCFunction_GET_FUNCTION(func);
+    PyObject *self = PyCFunction_GET_SELF(func);
+    int flags = PyCFunction_GET_FLAGS(func);
+    assert(PyCFunction_Check(func));
+    assert(METH_FASTCALL == (flags & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS))); /* preconditions are enforced only in builds where assert is active */
+    assert(nargs >= 0);
+    assert(nargs == 0 || args != NULL);
+    /* _PyCFunction_FastCallDict() must not be called with an exception set,
+       because it may clear it (directly or indirectly) and so the
+       caller loses its exception */
+    assert(!PyErr_Occurred());
+    if ((PY_VERSION_HEX < 0x030700A0) || unlikely(flags & METH_KEYWORDS)) { /* before 3.7, or with METH_KEYWORDS set, the four-argument form is called with NULL as the last argument */
+        return (*((__Pyx_PyCFunctionFastWithKeywords)meth)) (self, args, nargs, NULL);
+    } else {
+        return (*((__Pyx_PyCFunctionFast)meth)) (self, args, nargs);
+    }
+}
+#endif
+
+/* PyFunctionFastCall */
+    #if CYTHON_FAST_PYCALL
+#include "frameobject.h"
+static PyObject* __Pyx_PyFunction_FastCallNoKw(PyCodeObject *co, PyObject **args, Py_ssize_t na,
+                                               PyObject *globals) { /* Evaluate code object `co` in a new frame whose first `na` local slots are filled from `args`; returns the evaluation result. */
+    PyFrameObject *f;
+    PyThreadState *tstate = __Pyx_PyThreadState_Current;
+    PyObject **fastlocals;
+    Py_ssize_t i;
+    PyObject *result;
+    assert(globals != NULL);
+    /* XXX Perhaps we should create a specialized
+       PyFrame_New() that doesn't take locals, but does
+       take builtins without sanity checking them.
+       */
+    assert(tstate != NULL);
+    f = PyFrame_New(tstate, co, globals, NULL);
+    if (f == NULL) {
+        return NULL;
+    }
+    fastlocals = f->f_localsplus;
+    for (i = 0; i < na; i++) { /* slot i receives its own reference to argument i */
+        Py_INCREF(*args);
+        fastlocals[i] = *args++;
+    }
+    result = PyEval_EvalFrameEx(f,0);
+    ++tstate->recursion_depth; /* depth is raised only around the frame release on the next line */
+    Py_DECREF(f);
+    --tstate->recursion_depth;
+    return result;
+}
+#if 1 || PY_VERSION_HEX < 0x030600B1
+static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, int nargs, PyObject *kwargs) { /* Call Python function `func` with `nargs` positional arguments from `args` and an optional keyword dict; returns the result or NULL. */
+    PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func);
+    PyObject *globals = PyFunction_GET_GLOBALS(func);
+    PyObject *argdefs = PyFunction_GET_DEFAULTS(func);
+    PyObject *closure;
+#if PY_MAJOR_VERSION >= 3
+    PyObject *kwdefs;
+#endif
+    PyObject *kwtuple, **k;
+    PyObject **d;
+    Py_ssize_t nd;
+    Py_ssize_t nk;
+    PyObject *result;
+    assert(kwargs == NULL || PyDict_Check(kwargs));
+    nk = kwargs ? PyDict_Size(kwargs) : 0;
+    if (Py_EnterRecursiveCall((char*)" while calling a Python object")) {
+        return NULL;
+    }
+    if ( /* fast path: no keywords and code flags exactly OPTIMIZED|NEWLOCALS|NOFREE */
+#if PY_MAJOR_VERSION >= 3
+            co->co_kwonlyargcount == 0 &&
+#endif
+            likely(kwargs == NULL || nk == 0) &&
+            co->co_flags == (CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE)) {
+        if (argdefs == NULL && co->co_argcount == nargs) { /* no defaults and the argument count matches exactly */
+            result = __Pyx_PyFunction_FastCallNoKw(co, args, nargs, globals);
+            goto done;
+        }
+        else if (nargs == 0 && argdefs != NULL
+                 && co->co_argcount == Py_SIZE(argdefs)) {
+            /* function called with no arguments, but all parameters have
+               a default value: use default values as arguments .*/
+            args = &PyTuple_GET_ITEM(argdefs, 0);
+            result =__Pyx_PyFunction_FastCallNoKw(co, args, Py_SIZE(argdefs), globals);
+            goto done;
+        }
+    }
+    if (kwargs != NULL) { /* general path: flatten the dict into key, value, key, value, ... */
+        Py_ssize_t pos, i;
+        kwtuple = PyTuple_New(2 * nk);
+        if (kwtuple == NULL) {
+            result = NULL;
+            goto done;
+        }
+        k = &PyTuple_GET_ITEM(kwtuple, 0);
+        pos = i = 0;
+        while (PyDict_Next(kwargs, &pos, &k[i], &k[i+1])) {
+            Py_INCREF(k[i]); /* the tuple slots were written directly, so add the references the tuple will later release */
+            Py_INCREF(k[i+1]);
+            i += 2;
+        }
+        nk = i / 2; /* number of key/value pairs actually copied */
+    }
+    else {
+        kwtuple = NULL;
+        k = NULL;
+    }
+    closure = PyFunction_GET_CLOSURE(func);
+#if PY_MAJOR_VERSION >= 3
+    kwdefs = PyFunction_GET_KW_DEFAULTS(func);
+#endif
+    if (argdefs != NULL) {
+        d = &PyTuple_GET_ITEM(argdefs, 0);
+        nd = Py_SIZE(argdefs);
+    }
+    else {
+        d = NULL;
+        nd = 0;
+    }
+#if PY_MAJOR_VERSION >= 3
+    result = PyEval_EvalCodeEx((PyObject*)co, globals, (PyObject *)NULL,
+                               args, nargs,
+                               k, (int)nk,
+                               d, (int)nd, kwdefs, closure);
+#else
+    result = PyEval_EvalCodeEx(co, globals, (PyObject *)NULL,
+                               args, nargs,
+                               k, (int)nk,
+                               d, (int)nd, closure);
+#endif
+    Py_XDECREF(kwtuple); /* only reached on the general path; every `goto done` above happens while no tuple is held */
+done:
+    Py_LeaveRecursiveCall(); /* pairs with the Py_EnterRecursiveCall() at the top on every exit after it */
+    return result;
+}
+#endif
+#endif
+
+/* PyObjectCall */
+    #if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) {
+    /* Call `func` with argument tuple `arg` and keywords `kw` via its tp_call slot, under the recursion guard. */
+    PyObject *retval;
+    ternaryfunc slot = func->ob_type->tp_call;
+    if (unlikely(slot == NULL))
+        return PyObject_Call(func, arg, kw);
+    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object") != 0))
+        return NULL;
+    retval = slot(func, arg, kw);
+    Py_LeaveRecursiveCall();
+    /* A NULL result must be accompanied by an exception; supply one if the callee set none. */
+    if (unlikely(retval == NULL) && unlikely(PyErr_Occurred() == NULL))
+        PyErr_SetString(PyExc_SystemError,
+                        "NULL result without error in PyObject_Call");
+    return retval;
+}
+#endif
+
+/* PyObjectCallMethO */
+    #if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg) {
+    /* Invoke the C function wrapped by `func` as impl(self, arg), under the
+       recursion guard.  `func` is used as a PyCFunction without being checked here. */
+    PyCFunction impl = PyCFunction_GET_FUNCTION(func);
+    PyObject *receiver = PyCFunction_GET_SELF(func);
+    PyObject *retval;
+    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object") != 0))
+        return NULL;
+    retval = impl(receiver, arg);
+    Py_LeaveRecursiveCall();
+    /* A NULL result must be accompanied by an exception; supply one if the callee set none. */
+    if (unlikely(retval == NULL) && unlikely(PyErr_Occurred() == NULL))
+        PyErr_SetString(PyExc_SystemError,
+                        "NULL result without error in PyObject_Call");
+    return retval;
+}
+#endif
+
+/* PyObjectCallOneArg */
+    #if CYTHON_COMPILING_IN_CPYTHON
+static PyObject* __Pyx__PyObject_CallOneArg(PyObject *func, PyObject *arg) { /* Call func(arg) by packing `arg` into a temporary 1-tuple. */
+    PyObject *retval = NULL, *argtuple = PyTuple_New(1);
+    if (likely(argtuple)) {
+        Py_INCREF(arg);  /* the reference stored in the tuple is released together with it */
+        PyTuple_SET_ITEM(argtuple, 0, arg);
+        retval = __Pyx_PyObject_Call(func, argtuple, NULL);
+        Py_DECREF(argtuple);
+    }
+    return retval;
+}
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) { /* Call func(arg), choosing a direct path for Python functions and C functions. */
+#if CYTHON_FAST_PYCALL
+    if (PyFunction_Check(func))
+        return __Pyx_PyFunction_FastCall(func, &arg, 1);
+#endif
+    if (likely(PyCFunction_Check(func))) {
+        const int meth_flags = PyCFunction_GET_FLAGS(func);
+        if (likely(meth_flags & METH_O))
+            return __Pyx_PyObject_CallMethO(func, arg);
+#if CYTHON_FAST_PYCCALL
+        if (meth_flags & METH_FASTCALL)
+            return __Pyx_PyCFunction_FastCall(func, &arg, 1);
+#endif
+    }
+    /* Everything else is called through the tuple-packing helper. */
+    return __Pyx__PyObject_CallOneArg(func, arg);
+}
+#else
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) { /* Call func(arg) through a 1-tuple built with PyTuple_Pack(). */
+    PyObject *retval = NULL, *argtuple = PyTuple_Pack(1, arg);
+    if (likely(argtuple)) {
+        retval = __Pyx_PyObject_Call(func, argtuple, NULL);
+        Py_DECREF(argtuple);
+    }
+    return retval;
+}
+#endif
+
+/* PyObjectCallNoArg */
+    #if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func) { /* Call `func` with no arguments, using a direct path where one applies. */
+#if CYTHON_FAST_PYCALL
+    if (PyFunction_Check(func)) {
+        return __Pyx_PyFunction_FastCall(func, NULL, 0);
+    }
+#endif
+#ifdef __Pyx_CyFunction_USED
+    if (likely(PyCFunction_Check(func) || __Pyx_TypeCheck(func, __pyx_CyFunctionType))) { /* CyFunction objects qualify too when that type is compiled in */
+#else
+    if (likely(PyCFunction_Check(func))) {
+#endif
+        if (likely(PyCFunction_GET_FLAGS(func) & METH_NOARGS)) {
+            return __Pyx_PyObject_CallMethO(func, NULL); /* reuses the (self, arg) helper with a NULL second argument */
+        }
+    }
+    return __Pyx_PyObject_Call(func, __pyx_empty_tuple, NULL); /* generic path with __pyx_empty_tuple as the argument tuple */
+}
+#endif
+
+/* GetItemInt */
+      static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) {
+    /* Return o[j] and release the caller's reference to the index object `j`.
+       A NULL `j` is passed through as a NULL result. */
+    PyObject *item = j ? PyObject_GetItem(o, j) : NULL;
+    Py_XDECREF(j);
+    return item;
+}
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i,
+                                                              CYTHON_NCP_UNUSED int wraparound,
+                                                              CYTHON_NCP_UNUSED int boundscheck) { /* Return an owned reference to item `i` of `o`, which is accessed with the PyList_* macros; the two flags enable negative-index wrapping and range checking. */
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    Py_ssize_t wrapped_i = i;
+    if (wraparound & unlikely(i < 0)) { /* bitwise '&': both operands are evaluated, and only bit 0 of `wraparound` takes part */
+        wrapped_i += PyList_GET_SIZE(o);
+    }
+    if ((!boundscheck) || likely((0 <= wrapped_i) & (wrapped_i < PyList_GET_SIZE(o)))) { /* with boundscheck == 0 the index is used unchecked */
+        PyObject *r = PyList_GET_ITEM(o, wrapped_i);
+        Py_INCREF(r);
+        return r;
+    }
+    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); /* failed range check: retry through the generic lookup with the original, unwrapped index */
+#else
+    return PySequence_GetItem(o, i);
+#endif
+}
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i,
+                                                              CYTHON_NCP_UNUSED int wraparound,
+                                                              CYTHON_NCP_UNUSED int boundscheck) { /* Return an owned reference to item `i` of `o`, which is accessed with the PyTuple_* macros; the two flags enable negative-index wrapping and range checking. */
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    Py_ssize_t wrapped_i = i;
+    if (wraparound & unlikely(i < 0)) { /* bitwise '&': both operands are evaluated, and only bit 0 of `wraparound` takes part */
+        wrapped_i += PyTuple_GET_SIZE(o);
+    }
+    if ((!boundscheck) || likely((0 <= wrapped_i) & (wrapped_i < PyTuple_GET_SIZE(o)))) { /* with boundscheck == 0 the index is used unchecked */
+        PyObject *r = PyTuple_GET_ITEM(o, wrapped_i);
+        Py_INCREF(r);
+        return r;
+    }
+    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); /* failed range check: retry through the generic lookup with the original, unwrapped index */
+#else
+    return PySequence_GetItem(o, i);
+#endif
+}
+static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, int is_list,
+                                                     CYTHON_NCP_UNUSED int wraparound,
+                                                     CYTHON_NCP_UNUSED int boundscheck) { /* Return an owned reference to o[i] for a C integer index, with direct paths for exact lists, exact tuples and types providing sq_item. */
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS && CYTHON_USE_TYPE_SLOTS
+    if (is_list || PyList_CheckExact(o)) { /* a nonzero is_list skips the type check entirely */
+        Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyList_GET_SIZE(o);
+        if ((!boundscheck) || (likely((n >= 0) & (n < PyList_GET_SIZE(o))))) {
+            PyObject *r = PyList_GET_ITEM(o, n);
+            Py_INCREF(r);
+            return r;
+        }
+    }
+    else if (PyTuple_CheckExact(o)) {
+        Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyTuple_GET_SIZE(o);
+        if ((!boundscheck) || likely((n >= 0) & (n < PyTuple_GET_SIZE(o)))) {
+            PyObject *r = PyTuple_GET_ITEM(o, n);
+            Py_INCREF(r);
+            return r;
+        }
+    } else {
+        PySequenceMethods *m = Py_TYPE(o)->tp_as_sequence;
+        if (likely(m && m->sq_item)) {
+            if (wraparound && unlikely(i < 0) && likely(m->sq_length)) {
+                Py_ssize_t l = m->sq_length(o);
+                if (likely(l >= 0)) {
+                    i += l;
+                } else {
+                    if (!PyErr_ExceptionMatches(PyExc_OverflowError)) /* only an OverflowError from sq_length is tolerated */
+                        return NULL;
+                    PyErr_Clear(); /* length did not fit: continue with the unadjusted negative index */
+                }
+            }
+            return m->sq_item(o, i);
+        }
+    }
+#else
+    if (is_list || PySequence_Check(o)) {
+        return PySequence_GetItem(o, i);
+    }
+#endif
+    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i)); /* reached after a failed range check or when no direct path applies; uses the index as a Python int */
+}
+
+/* ObjectGetItem */
+      #if CYTHON_USE_TYPE_SLOTS
+static PyObject *__Pyx_PyObject_GetIndex(PyObject *obj, PyObject* index) {
+    /* Subscript `obj` through its sequence protocol after converting `index` to a Py_ssize_t. */
+    PyObject *pending;
+    Py_ssize_t position;
+    PySequenceMethods *seq = Py_TYPE(obj)->tp_as_sequence;
+    if (unlikely(seq == NULL || seq->sq_item == NULL)) {
+        PyErr_Format(PyExc_TypeError, "'%.200s' object is not subscriptable", Py_TYPE(obj)->tp_name);
+        return NULL;
+    }
+    position = __Pyx_PyIndex_AsSsize_t(index);
+    pending = (position == -1) ? PyErr_Occurred() : NULL;  /* -1 means failure only if an exception is set */
+    if (likely(pending == NULL)) return __Pyx_GetItemInt_Fast(obj, position, 0, 1, 1);
+    if (PyErr_GivenExceptionMatches(pending, PyExc_OverflowError)) {
+        PyErr_Clear();
+        PyErr_Format(PyExc_IndexError, "cannot fit '%.200s' into an index-sized integer", Py_TYPE(index)->tp_name);
+    }
+    return NULL;
+}
+static PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject* key) {
+    /* obj[key]: use the mapping slot when the type has one, otherwise treat `key` as an index. */
+    PyMappingMethods *mapping = Py_TYPE(obj)->tp_as_mapping;
+    if (unlikely(mapping == NULL || mapping->mp_subscript == NULL))
+        return __Pyx_PyObject_GetIndex(obj, key);
+    return mapping->mp_subscript(obj, key);
+}
+#endif
+
+/* PyObjectSetAttrStr */
+      #if CYTHON_USE_TYPE_SLOTS
+static CYTHON_INLINE int __Pyx_PyObject_SetAttrStr(PyObject* obj, PyObject* attr_name, PyObject* value) { /* Set an attribute through the type's own slot when it has one, else through PyObject_SetAttr(). */
+    PyTypeObject* type = Py_TYPE(obj);
+    if (unlikely(!type->tp_setattro)) {
+#if PY_MAJOR_VERSION < 3
+        if (likely(type->tp_setattr)) return type->tp_setattr(obj, PyString_AS_STRING(attr_name), value);
+#endif
+        return PyObject_SetAttr(obj, attr_name, value);
+    }
+    return type->tp_setattro(obj, attr_name, value);
+}
+#endif
+
+/* RaiseArgTupleInvalid */
+      static void __Pyx_RaiseArgtupleInvalid(
+    const char* func_name,
+    int exact,
+    Py_ssize_t num_min,
+    Py_ssize_t num_max,
+    Py_ssize_t num_found)
+{
+    /*
+     * Set a TypeError describing a wrong number of positional arguments.
+     *   func_name  name printed in front of "()"
+     *   exact      nonzero: the message says "exactly" rather than
+     *              "at least" / "at most"
+     *   num_min    reported as the expected count when num_found < num_min
+     *   num_max    reported as the expected count otherwise
+     *   num_found  count printed as "(N given)"
+     */
+    const int too_few = (num_found < num_min);
+    const Py_ssize_t wanted = too_few ? num_min : num_max;
+    const char *qualifier = exact ? "exactly" : (too_few ? "at least" : "at most");
+    const char *plural = (wanted == 1) ? "" : "s";
+    PyErr_Format(PyExc_TypeError,
+                 "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                 func_name, qualifier, wanted, plural, num_found);
+}
+
+/* RaiseDoubleKeywords */
+      static void __Pyx_RaiseDoubleKeywordsError(
+    const char* func_name,
+    PyObject* kw_name)
+{   /* Set a TypeError saying keyword `kw_name` received more than one value in a call to `func_name`. */
+#if PY_MAJOR_VERSION >= 3
+    PyErr_Format(PyExc_TypeError,
+        "%s() got multiple values for keyword argument '%U'", func_name, kw_name);
+#else
+    PyErr_Format(PyExc_TypeError,
+        "%s() got multiple values for keyword argument '%s'", func_name, PyString_AsString(kw_name));
+#endif
+}
+
+/* ParseKeywords */
+      static int __Pyx_ParseOptionalKeywords( /* Distribute the entries of keyword dict `kwds` into `values`; returns 0 on success, -1 with an exception set on failure. */
+    PyObject *kwds,
+    PyObject **argnames[], /* NULL-terminated array of pointers to the argument-name objects */
+    PyObject *kwds2, /* optional dict that receives unmatched keywords; when NULL an unmatched keyword is an error */
+    PyObject *values[], /* output array, indexed like argnames */
+    Py_ssize_t num_pos_args, /* the first num_pos_args names are not searched as keywords, only checked for duplicates */
+    const char* function_name) /* used in error messages only */
+{
+    PyObject *key = 0, *value = 0;
+    Py_ssize_t pos = 0;
+    PyObject*** name;
+    PyObject*** first_kw_arg = argnames + num_pos_args;
+    while (PyDict_Next(kwds, &pos, &key, &value)) {
+        name = first_kw_arg;
+        while (*name && (**name != key)) name++; /* first pass: pointer identity only */
+        if (*name) {
+            values[name-argnames] = value; /* stored without a reference-count change */
+            continue;
+        }
+        name = first_kw_arg; /* second pass: compare by string content */
+        #if PY_MAJOR_VERSION < 3
+        if (likely(PyString_CheckExact(key)) || likely(PyString_Check(key))) {
+            while (*name) {
+                if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key))
+                        && _PyString_Eq(**name, key)) {
+                    values[name-argnames] = value;
+                    break;
+                }
+                name++;
+            }
+            if (*name) continue;
+            else {
+                PyObject*** argname = argnames; /* no keyword matched: see whether the key names one of the leading num_pos_args arguments */
+                while (argname != first_kw_arg) {
+                    if ((**argname == key) || (
+                            (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key))
+                             && _PyString_Eq(**argname, key))) {
+                        goto arg_passed_twice;
+                    }
+                    argname++;
+                }
+            }
+        } else
+        #endif
+        if (likely(PyUnicode_Check(key))) {
+            while (*name) {
+                int cmp = (**name == key) ? 0 :
+                #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
+                    (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 : /* different lengths: unequal without a full comparison */
+                #endif
+                    PyUnicode_Compare(**name, key);
+                if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; /* a negative result counts as failure only when an exception is set */
+                if (cmp == 0) {
+                    values[name-argnames] = value;
+                    break;
+                }
+                name++;
+            }
+            if (*name) continue;
+            else {
+                PyObject*** argname = argnames; /* no keyword matched: see whether the key names one of the leading num_pos_args arguments */
+                while (argname != first_kw_arg) {
+                    int cmp = (**argname == key) ? 0 :
+                    #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
+                        (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :
+                    #endif
+                        PyUnicode_Compare(**argname, key);
+                    if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad;
+                    if (cmp == 0) goto arg_passed_twice;
+                    argname++;
+                }
+            }
+        } else
+            goto invalid_keyword_type; /* key is not one of the string types checked above */
+        if (kwds2) { /* reached only for a string key that matched no name */
+            if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad;
+        } else {
+            goto invalid_keyword;
+        }
+    }
+    return 0;
+arg_passed_twice:
+    __Pyx_RaiseDoubleKeywordsError(function_name, key);
+    goto bad;
+invalid_keyword_type:
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() keywords must be strings", function_name);
+    goto bad;
+invalid_keyword:
+    PyErr_Format(PyExc_TypeError,
+    #if PY_MAJOR_VERSION < 3
+        "%.200s() got an unexpected keyword argument '%.200s'",
+        function_name, PyString_AsString(key));
+    #else
+        "%s() got an unexpected keyword argument '%U'",
+        function_name, key);
+    #endif
+bad:
+    return -1;
+}
+
+/* Import */
+      static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { /* Import module `name` with the given from-list and relative-import level; returns the module or NULL. */
+    PyObject *empty_list = 0;
+    PyObject *module = 0; /* stays 0 on every early `goto bad`, so those paths return NULL */
+    PyObject *global_dict = 0;
+    PyObject *empty_dict = 0;
+    PyObject *list;
+    #if PY_MAJOR_VERSION < 3
+    PyObject *py_import;
+    py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import);
+    if (!py_import)
+        goto bad;
+    #endif
+    if (from_list)
+        list = from_list;
+    else {
+        empty_list = PyList_New(0); /* a missing from-list is replaced by a fresh empty list */
+        if (!empty_list)
+            goto bad;
+        list = empty_list;
+    }
+    global_dict = PyModule_GetDict(__pyx_m); /* not released below, unlike the objects created in this function */
+    if (!global_dict)
+        goto bad;
+    empty_dict = PyDict_New();
+    if (!empty_dict)
+        goto bad;
+    {
+        #if PY_MAJOR_VERSION >= 3
+        if (level == -1) { /* -1: try level 1 first (only when the module name contains a dot), then level 0 */
+            if (strchr(__Pyx_MODULE_NAME, '.')) {
+                module = PyImport_ImportModuleLevelObject(
+                    name, global_dict, empty_dict, list, 1);
+                if (!module) {
+                    if (!PyErr_ExceptionMatches(PyExc_ImportError)) /* only ImportError triggers the level-0 retry */
+                        goto bad;
+                    PyErr_Clear();
+                }
+            }
+            level = 0;
+        }
+        #endif
+        if (!module) {
+            #if PY_MAJOR_VERSION < 3
+            PyObject *py_level = PyInt_FromLong(level);
+            if (!py_level)
+                goto bad;
+            module = PyObject_CallFunctionObjArgs(py_import,
+                name, global_dict, empty_dict, list, py_level, NULL);
+            Py_DECREF(py_level);
+            #else
+            module = PyImport_ImportModuleLevelObject(
+                name, global_dict, empty_dict, list, level);
+            #endif
+        }
+    }
+bad: /* also the normal exit: releases the temporaries and returns `module` */
+    #if PY_MAJOR_VERSION < 3
+    Py_XDECREF(py_import);
+    #endif
+    Py_XDECREF(empty_list);
+    Py_XDECREF(empty_dict);
+    return module;
+}
+
+/* ImportFrom */
+      static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name) {
+    /* Fetch attribute `name` from `module`; an AttributeError is replaced by ImportError. */
+    PyObject* attr = __Pyx_PyObject_GetAttrStr(module, name);
+    if (likely(attr) || !PyErr_ExceptionMatches(PyExc_AttributeError))
+        return attr;
+#if PY_MAJOR_VERSION < 3
+    PyErr_Format(PyExc_ImportError, "cannot import name %.230s", PyString_AS_STRING(name));
+#else
+    PyErr_Format(PyExc_ImportError, "cannot import name %S", name);
+#endif
+    return attr;
+}
+
+/* FetchCommonType */
+      static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type) { /* Return the type registered under type->tp_name in the "_cython_" CYTHON_ABI module, registering `type` itself if none exists; NULL on error. */
+    PyObject* fake_module;
+    PyTypeObject* cached_type = NULL;
+    fake_module = PyImport_AddModule((char*) "_cython_" CYTHON_ABI);
+    if (!fake_module) return NULL;
+    Py_INCREF(fake_module); /* held for the duration of this function, released at `done` */
+    cached_type = (PyTypeObject*) PyObject_GetAttrString(fake_module, type->tp_name);
+    if (cached_type) { /* an entry already exists: validate it before reuse */
+        if (!PyType_Check((PyObject*)cached_type)) {
+            PyErr_Format(PyExc_TypeError,
+                "Shared Cython type %.200s is not a type object",
+                type->tp_name);
+            goto bad;
+        }
+        if (cached_type->tp_basicsize != type->tp_basicsize) { /* the only compatibility check made here is the instance size */
+            PyErr_Format(PyExc_TypeError,
+                "Shared Cython type %.200s has the wrong size, try recompiling",
+                type->tp_name);
+            goto bad;
+        }
+    } else {
+        if (!PyErr_ExceptionMatches(PyExc_AttributeError)) goto bad; /* only a missing attribute means "not registered yet" */
+        PyErr_Clear();
+        if (PyType_Ready(type) < 0) goto bad;
+        if (PyObject_SetAttrString(fake_module, type->tp_name, (PyObject*) type) < 0)
+            goto bad;
+        Py_INCREF(type); /* the returned pointer carries its own reference, as in the cached case */
+        cached_type = type;
+    }
+done:
+    Py_DECREF(fake_module);
+    return cached_type;
+bad:
+    Py_XDECREF(cached_type); /* drops a rejected cached entry; no-op when the lookup returned nothing */
+    cached_type = NULL;
+    goto done;
+}
+
+/* CythonFunction */
+      #include <structmember.h>
+static PyObject *
+__Pyx_CyFunction_get_doc(__pyx_CyFunctionObject *op, CYTHON_UNUSED void *closure)
+{
+    /* Getter for __doc__/func_doc: builds the string from ml_doc on first use and caches it. */
+    if (unlikely(op->func_doc == NULL)) {
+        const char *raw_doc = op->func.m_ml->ml_doc;
+        if (raw_doc == NULL) {
+            Py_INCREF(Py_None);
+            return Py_None;
+        }
+#if PY_MAJOR_VERSION >= 3
+        op->func_doc = PyUnicode_FromString(raw_doc);
+#else
+        op->func_doc = PyString_FromString(raw_doc);
+#endif
+        if (unlikely(op->func_doc == NULL)) return NULL;
+    }
+    Py_INCREF(op->func_doc);
+    return op->func_doc;
+}
+static int
+__Pyx_CyFunction_set_doc(__pyx_CyFunctionObject *op, PyObject *value)
+{
+    /* Setter for __doc__/func_doc: a NULL value (deletion) stores None.  Always returns 0. */
+    PyObject *previous = op->func_doc;
+    PyObject *replacement = value ? value : Py_None;
+    Py_INCREF(replacement);
+    op->func_doc = replacement;
+    /* Release the old object only after the field holds the new one. */
+    Py_XDECREF(previous);
+    return 0;
+}
+static PyObject *
+__Pyx_CyFunction_get_name(__pyx_CyFunctionObject *op)
+{
+    PyObject *cached = op->func_name;  /* Getter for __name__/func_name: interned from ml_name on first use. */
+    if (unlikely(cached == NULL)) {
+#if PY_MAJOR_VERSION >= 3
+        cached = op->func_name = PyUnicode_InternFromString(op->func.m_ml->ml_name);
+#else
+        cached = op->func_name = PyString_InternFromString(op->func.m_ml->ml_name);
+#endif
+        if (unlikely(cached == NULL)) return NULL;
+    }
+    Py_INCREF(cached);
+    return cached;
+}
+static int
+__Pyx_CyFunction_set_name(__pyx_CyFunctionObject *op, PyObject *value)
+{
+    /* Setter for __name__/func_name: only a string object is accepted; deletion is rejected. */
+    PyObject *previous = op->func_name;
+#if PY_MAJOR_VERSION >= 3
+    const int is_text = (value != NULL) && PyUnicode_Check(value);
+#else
+    const int is_text = (value != NULL) && PyString_Check(value);
+#endif
+    if (unlikely(!is_text)) {
+        PyErr_SetString(PyExc_TypeError, "__name__ must be set to a string object");
+        return -1;
+    }
+    Py_INCREF(value);
+    op->func_name = value;
+    Py_XDECREF(previous);
+    return 0;
+}
+static PyObject *__Pyx_CyFunction_get_qualname(__pyx_CyFunctionObject *op)
+{
+    PyObject *qualname = op->func_qualname;  /* Getter for __qualname__: returns a new reference. */
+    Py_INCREF(qualname);
+    return qualname;
+}
+static int
+__Pyx_CyFunction_set_qualname(__pyx_CyFunctionObject *op, PyObject *value)
+{
+    /* Setter for __qualname__: only a string object is accepted; deletion is rejected. */
+    PyObject *previous = op->func_qualname;
+#if PY_MAJOR_VERSION >= 3
+    const int is_text = (value != NULL) && PyUnicode_Check(value);
+#else
+    const int is_text = (value != NULL) && PyString_Check(value);
+#endif
+    if (unlikely(!is_text)) {
+        PyErr_SetString(PyExc_TypeError, "__qualname__ must be set to a string object");
+        return -1;
+    }
+    Py_INCREF(value);
+    op->func_qualname = value;
+    Py_XDECREF(previous);
+    return 0;
+}
+static PyObject *
+__Pyx_CyFunction_get_self(__pyx_CyFunctionObject *m, CYTHON_UNUSED void *closure)
+{
+    /* Getter for __self__: yields the object stored in the func_closure
+       field, or None when that field is NULL.  Returns a new reference.
+       The `closure` parameter is unused. */
+    PyObject *bound = m->func_closure ? m->func_closure : Py_None;
+    Py_INCREF(bound);
+    return bound;
+}
+static PyObject *
+__Pyx_CyFunction_get_dict(__pyx_CyFunctionObject *op)
+{
+    PyObject *attrs = op->func_dict;  /* Getter for __dict__/func_dict: an empty dict is created on first access. */
+    if (unlikely(!attrs)) {
+        attrs = op->func_dict = PyDict_New();
+        if (unlikely(!attrs)) return NULL;
+    }
+    Py_INCREF(attrs);
+    return attrs;
+}
+static int
+__Pyx_CyFunction_set_dict(__pyx_CyFunctionObject *op, PyObject *value)
+{
+    /* Setter for __dict__/func_dict: the value must be a dict and the
+       attribute cannot be deleted.  Returns 0, or -1 with TypeError set. */
+    PyObject *previous = op->func_dict;
+    const char *problem = NULL;
+    if (unlikely(value == NULL))
+        problem = "function's dictionary may not be deleted";
+    else if (unlikely(!PyDict_Check(value)))
+        problem = "setting function's dictionary to a non-dict";
+    if (unlikely(problem != NULL)) {
+        PyErr_SetString(PyExc_TypeError, problem);
+        return -1;
+    }
+    Py_INCREF(value);
+    op->func_dict = value;
+    Py_XDECREF(previous);
+    return 0;
+}
+static PyObject *__Pyx_CyFunction_get_globals(__pyx_CyFunctionObject *op)
+{
+    PyObject *globals_obj = op->func_globals;  /* Getter for __globals__/func_globals: returns a new reference. */
+    Py_INCREF(globals_obj);
+    return globals_obj;
+}
+static PyObject *__Pyx_CyFunction_get_closure(CYTHON_UNUSED __pyx_CyFunctionObject *op)
+{
+    PyObject *none = Py_None;  /* Getter for __closure__/func_closure: always None. */
+    Py_INCREF(none);
+    return none;
+}
+static PyObject *__Pyx_CyFunction_get_code(__pyx_CyFunctionObject *op)
+{
+    PyObject* code = op->func_code;  /* Getter for __code__/func_code: the stored object, or None when the field is NULL. */
+    if (code == NULL) code = Py_None;
+    Py_INCREF(code);
+    return code;
+}
+static int
+__Pyx_CyFunction_init_defaults(__pyx_CyFunctionObject *op) {
+    /* Call defaults_getter and store items 0 and 1 of its result as defaults_tuple / defaults_kwdict; returns 0 or -1. */
+    int status = -1;
+    PyObject *pair = op->defaults_getter((PyObject *) op);
+    if (unlikely(pair == NULL)) return -1;
+#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+    op->defaults_tuple = PyTuple_GET_ITEM(pair, 0);
+    Py_INCREF(op->defaults_tuple);
+    op->defaults_kwdict = PyTuple_GET_ITEM(pair, 1);
+    Py_INCREF(op->defaults_kwdict);
+    status = 0;
+#else
+    op->defaults_tuple = PySequence_ITEM(pair, 0);
+    if (likely(op->defaults_tuple)) {
+        op->defaults_kwdict = PySequence_ITEM(pair, 1);
+        if (likely(op->defaults_kwdict)) status = 0;
+    }
+#endif
+    Py_DECREF(pair);
+    return status;
+}
+static int
+__Pyx_CyFunction_set_defaults(__pyx_CyFunctionObject *op, PyObject* value) {
+    /* Setter for __defaults__/func_defaults: accepts a tuple or None; a NULL value (deletion) stores None. */
+    PyObject* previous;
+    PyObject* replacement = value ? value : Py_None;
+    if (unlikely(replacement != Py_None && !PyTuple_Check(replacement))) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__defaults__ must be set to a tuple object");
+        return -1;
+    }
+    Py_INCREF(replacement);
+    previous = op->defaults_tuple;
+    op->defaults_tuple = replacement;
+    Py_XDECREF(previous);
+    return 0;
+}
+static PyObject *
+__Pyx_CyFunction_get_defaults(__pyx_CyFunctionObject *op) {
+    /* Getter for __defaults__/func_defaults.  An unset field is filled via
+       defaults_getter when one is installed; without a getter the answer is None. */
+    PyObject* defaults = op->defaults_tuple;
+    if (unlikely(!defaults) && !op->defaults_getter)
+        defaults = Py_None;
+    if (unlikely(!defaults)) {
+        if (__Pyx_CyFunction_init_defaults(op) < 0) return NULL;
+        defaults = op->defaults_tuple;
+    }
+    Py_INCREF(defaults);
+    return defaults;
+}
+static int
+__Pyx_CyFunction_set_kwdefaults(__pyx_CyFunctionObject *op, PyObject* value) {
+    /* Setter for __kwdefaults__: accepts a dict or None; a NULL value (deletion) stores None. */
+    PyObject* previous;
+    PyObject* replacement = value ? value : Py_None;
+    if (unlikely(replacement != Py_None && !PyDict_Check(replacement))) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__kwdefaults__ must be set to a dict object");
+        return -1;
+    }
+    Py_INCREF(replacement);
+    previous = op->defaults_kwdict;
+    op->defaults_kwdict = replacement;
+    Py_XDECREF(previous);
+    return 0;
+}
+static PyObject *
+__Pyx_CyFunction_get_kwdefaults(__pyx_CyFunctionObject *op) {
+    /* Getter for __kwdefaults__.  An unset field is filled via defaults_getter
+       when one is installed; without a getter the answer is None. */
+    PyObject* kwdefaults = op->defaults_kwdict;
+    if (unlikely(!kwdefaults) && !op->defaults_getter)
+        kwdefaults = Py_None;
+    if (unlikely(!kwdefaults)) {
+        if (__Pyx_CyFunction_init_defaults(op) < 0) return NULL;
+        kwdefaults = op->defaults_kwdict;
+    }
+    Py_INCREF(kwdefaults);
+    return kwdefaults;
+}
+static int
+__Pyx_CyFunction_set_annotations(__pyx_CyFunctionObject *op, PyObject* value) {
+    /* Setter for __annotations__: accepts a dict; None or a NULL value (deletion) resets the field to NULL. */
+    PyObject* previous;
+    PyObject* replacement = (value == Py_None) ? NULL : value;
+    if (unlikely(replacement != NULL && !PyDict_Check(replacement))) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__annotations__ must be set to a dict object");
+        return -1;
+    }
+    Py_XINCREF(replacement);
+    previous = op->func_annotations;
+    op->func_annotations = replacement;
+    Py_XDECREF(previous);
+    return 0;
+}
+static PyObject *
+__Pyx_CyFunction_get_annotations(__pyx_CyFunctionObject *op) {
+    /* Getter for __annotations__: an empty dict is created and stored on first access. */
+    if (unlikely(op->func_annotations == NULL)) {
+        PyObject* fresh = PyDict_New();
+        if (unlikely(fresh == NULL)) return NULL;
+        op->func_annotations = fresh;
+    }
+    Py_INCREF(op->func_annotations);
+    return op->func_annotations;
+}
+static PyGetSetDef __pyx_CyFunction_getsets[] = { /* Attribute table for the CyFunction type; most attributes appear under both a func_* and a dunder name sharing the same accessors. */
+    {(char *) "func_doc", (getter)__Pyx_CyFunction_get_doc, (setter)__Pyx_CyFunction_set_doc, 0, 0},
+    {(char *) "__doc__",  (getter)__Pyx_CyFunction_get_doc, (setter)__Pyx_CyFunction_set_doc, 0, 0},
+    {(char *) "func_name", (getter)__Pyx_CyFunction_get_name, (setter)__Pyx_CyFunction_set_name, 0, 0},
+    {(char *) "__name__", (getter)__Pyx_CyFunction_get_name, (setter)__Pyx_CyFunction_set_name, 0, 0},
+    {(char *) "__qualname__", (getter)__Pyx_CyFunction_get_qualname, (setter)__Pyx_CyFunction_set_qualname, 0, 0},
+    {(char *) "__self__", (getter)__Pyx_CyFunction_get_self, 0, 0, 0}, /* entries with a 0 setter provide no setter function */
+    {(char *) "func_dict", (getter)__Pyx_CyFunction_get_dict, (setter)__Pyx_CyFunction_set_dict, 0, 0},
+    {(char *) "__dict__", (getter)__Pyx_CyFunction_get_dict, (setter)__Pyx_CyFunction_set_dict, 0, 0},
+    {(char *) "func_globals", (getter)__Pyx_CyFunction_get_globals, 0, 0, 0},
+    {(char *) "__globals__", (getter)__Pyx_CyFunction_get_globals, 0, 0, 0},
+    {(char *) "func_closure", (getter)__Pyx_CyFunction_get_closure, 0, 0, 0},
+    {(char *) "__closure__", (getter)__Pyx_CyFunction_get_closure, 0, 0, 0},
+    {(char *) "func_code", (getter)__Pyx_CyFunction_get_code, 0, 0, 0},
+    {(char *) "__code__", (getter)__Pyx_CyFunction_get_code, 0, 0, 0},
+    {(char *) "func_defaults", (getter)__Pyx_CyFunction_get_defaults, (setter)__Pyx_CyFunction_set_defaults, 0, 0},
+    {(char *) "__defaults__", (getter)__Pyx_CyFunction_get_defaults, (setter)__Pyx_CyFunction_set_defaults, 0, 0},
+    {(char *) "__kwdefaults__", (getter)__Pyx_CyFunction_get_kwdefaults, (setter)__Pyx_CyFunction_set_kwdefaults, 0, 0},
+    {(char *) "__annotations__", (getter)__Pyx_CyFunction_get_annotations, (setter)__Pyx_CyFunction_set_annotations, 0, 0},
+    {0, 0, 0, 0, 0} /* all-zero sentinel ends the table */
+};
+static PyMemberDef __pyx_CyFunction_members[] = { /* Member table for the CyFunction type: one direct struct-field attribute. */
+    {(char *) "__module__", T_OBJECT, offsetof(PyCFunctionObject, m_module), PY_WRITE_RESTRICTED, 0}, /* __module__ maps onto the m_module field of PyCFunctionObject */
+    {0, 0, 0,  0, 0} /* all-zero sentinel ends the table */
+};
+static PyObject *__Pyx_CyFunction_reduce(__pyx_CyFunctionObject *m, CYTHON_UNUSED PyObject *args)
+{
+    const char *c_name = m->func.m_ml->ml_name;  /* __reduce__ returns the ml_name string as a new string object */
+#if PY_MAJOR_VERSION >= 3
+    return PyUnicode_FromString(c_name);
+#else
+    return PyString_FromString(c_name);
+#endif
+}
+static PyMethodDef __pyx_CyFunction_methods[] = { /* Method table for the CyFunction type. */
+    {"__reduce__", (PyCFunction)__Pyx_CyFunction_reduce, METH_VARARGS, 0}, /* registered as METH_VARARGS; the implementation ignores its args */
+    {0, 0, 0, 0} /* all-zero sentinel ends the table */
+};
+#if PY_VERSION_HEX >= 0x030500A0  /* use the m_weakreflist slot of the embedded PyCFunctionObject */
+#define __Pyx_CyFunction_weakreflist(cyfunc) ((cyfunc)->func.m_weakreflist)
+#else  /* older interpreters: use the cyfunction's own func_weakreflist field */
+#define __Pyx_CyFunction_weakreflist(cyfunc) ((cyfunc)->func_weakreflist)
+#endif
+static PyObject *__Pyx_CyFunction_New(PyTypeObject *type, PyMethodDef *ml, int flags, PyObject* qualname,
+                                      PyObject *closure, PyObject *module, PyObject* globals, PyObject* code) {  /* new GC-tracked cyfunction, or NULL if allocation fails */
+    __pyx_CyFunctionObject *cyfunc = PyObject_GC_New(__pyx_CyFunctionObject, type);
+    if (!cyfunc)
+        return NULL;
+    cyfunc->flags = flags;
+    cyfunc->func.m_ml = ml;
+    cyfunc->func.m_self = (PyObject *) cyfunc;  /* the function is stored as its own m_self */
+    __Pyx_CyFunction_weakreflist(cyfunc) = NULL;
+    Py_XINCREF(module);    /* module, closure and code may be NULL */
+    cyfunc->func.m_module = module;
+    Py_XINCREF(closure);
+    cyfunc->func_closure = closure;
+    Py_XINCREF(code);
+    cyfunc->func_code = code;
+    Py_INCREF(qualname);   /* qualname and globals must be non-NULL */
+    cyfunc->func_qualname = qualname;
+    Py_INCREF(globals);
+    cyfunc->func_globals = globals;
+    cyfunc->func_name = NULL;  /* every remaining field starts out empty */
+    cyfunc->func_doc = NULL;
+    cyfunc->func_dict = NULL;
+    cyfunc->func_classobj = NULL;
+    cyfunc->func_annotations = NULL;
+    cyfunc->defaults = NULL;
+    cyfunc->defaults_pyobjects = 0;
+    cyfunc->defaults_tuple = NULL;
+    cyfunc->defaults_kwdict = NULL;
+    cyfunc->defaults_getter = NULL;
+    PyObject_GC_Track(cyfunc);
+    return (PyObject *) cyfunc;
+}
+static int
+__Pyx_CyFunction_clear(__pyx_CyFunctionObject *m)  /* tp_clear: drop every reference the function owns; always returns 0 */
+{
+    Py_CLEAR(m->func_closure);
+    Py_CLEAR(m->func.m_module);
+    Py_CLEAR(m->func_dict);
+    Py_CLEAR(m->func_name);
+    Py_CLEAR(m->func_qualname);
+    Py_CLEAR(m->func_doc);
+    Py_CLEAR(m->func_globals);
+    Py_CLEAR(m->func_code);
+    Py_CLEAR(m->func_classobj);
+    Py_CLEAR(m->defaults_tuple);
+    Py_CLEAR(m->defaults_kwdict);
+    Py_CLEAR(m->func_annotations);
+    if (m->defaults) {  /* optional block obtained from PyObject_Malloc in __Pyx_CyFunction_InitDefaults */
+        PyObject **pydefaults = __Pyx_CyFunction_Defaults(PyObject *, m);
+        int i;
+        for (i = 0; i < m->defaults_pyobjects; i++)  /* the leading defaults_pyobjects slots are treated as object references */
+            Py_XDECREF(pydefaults[i]);
+        PyObject_Free(m->defaults);
+        m->defaults = NULL;  /* makes a second clear a no-op for this block */
+    }
+    return 0;
+}
+static void __Pyx__CyFunction_dealloc(__pyx_CyFunctionObject *m)  /* destroy a cyfunction; does not untrack it from the GC itself */
+{
+    if (__Pyx_CyFunction_weakreflist(m))   /* weak references must be cleared before the fields go away */
+        PyObject_ClearWeakRefs((PyObject *) m);
+    (void) __Pyx_CyFunction_clear(m);      /* release owned references and the defaults block */
+    PyObject_GC_Del(m);
+}
+static void __Pyx_CyFunction_dealloc(__pyx_CyFunctionObject *m) {  /* installed as tp_dealloc */
+    /* Remove the object from GC tracking first, then run the actual teardown. */
+    PyObject_GC_UnTrack(m);
+    __Pyx__CyFunction_dealloc(m);
+}
+static int __Pyx_CyFunction_traverse(__pyx_CyFunctionObject *m, visitproc visit, void *arg) {  /* tp_traverse: report every owned reference to the cycle collector */
+    Py_VISIT(m->func_closure);
+    Py_VISIT(m->func.m_module);
+    Py_VISIT(m->func_dict);
+    Py_VISIT(m->func_name);
+    Py_VISIT(m->func_qualname);
+    Py_VISIT(m->func_doc);
+    Py_VISIT(m->func_globals);
+    Py_VISIT(m->func_code);
+    Py_VISIT(m->func_classobj);
+    Py_VISIT(m->defaults_tuple);
+    Py_VISIT(m->defaults_kwdict);
+    Py_VISIT(m->func_annotations);  /* owned and cleared like the other fields, so cycles through it must be visible to the GC */
+    if (m->defaults) {  /* object references stored at the start of the defaults block */
+        PyObject **pydefaults = __Pyx_CyFunction_Defaults(PyObject *, m);
+        int i;
+        for (i = 0; i < m->defaults_pyobjects; i++)
+            Py_VISIT(pydefaults[i]);
+    }
+    return 0;  /* Py_VISIT returns early with the visitor's result if it is non-zero */
+}
+static PyObject *__Pyx_CyFunction_descr_get(PyObject *func, PyObject *obj, PyObject *type)
+{   /* tp_descr_get: choose how the function is bound when looked up through a class or instance */
+    const int kind = ((__pyx_CyFunctionObject *) func)->flags;
+    if (kind & __Pyx_CYFUNCTION_STATICMETHOD) {  /* static methods are handed back unbound */
+        Py_INCREF(func);
+        return func;
+    }
+    if (!(kind & __Pyx_CYFUNCTION_CLASSMETHOD)) {  /* ordinary function: bind to obj, with None meaning "no instance" */
+        if (obj == Py_None)
+            obj = NULL;
+        return __Pyx_PyMethod_New(func, obj, type);
+    }
+    if (!type)  /* class method: bind to the class, taken from the instance when not supplied */
+        type = (PyObject *)(Py_TYPE(obj));
+    return __Pyx_PyMethod_New(func, type, (PyObject *)(Py_TYPE(type)));
+}
+static PyObject* __Pyx_CyFunction_repr(__pyx_CyFunctionObject *op)  /* tp_repr: "<cyfunction QUALNAME at ADDRESS>" */
+{
+    void *address = (void *)op;
+#if PY_MAJOR_VERSION < 3
+    return PyString_FromFormat("<cyfunction %s at %p>",
+                               PyString_AsString(op->func_qualname), address);
+#else
+    return PyUnicode_FromFormat("<cyfunction %U at %p>",
+                                op->func_qualname, address);
+#endif
+}
+static PyObject * __Pyx_CyFunction_CallMethod(PyObject *func, PyObject *self, PyObject *arg, PyObject *kw) {  /* call the wrapped C function with `self`, dispatching on its METH_* flags */
+    PyCFunctionObject* f = (PyCFunctionObject*)func;
+    PyCFunction meth = f->m_ml->ml_meth;
+    Py_ssize_t size;
+    switch (f->m_ml->ml_flags & (METH_VARARGS | METH_KEYWORDS | METH_NOARGS | METH_O)) {
+    case METH_VARARGS:  /* positional only: keywords must be absent or empty */
+        if (likely(kw == NULL || PyDict_Size(kw) == 0))
+            return (*meth)(self, arg);
+        break;  /* falls out to the "takes no keyword arguments" error below */
+    case METH_VARARGS | METH_KEYWORDS:
+        return (*(PyCFunctionWithKeywords)meth)(self, arg, kw);
+    case METH_NOARGS:
+        if (likely(kw == NULL || PyDict_Size(kw) == 0)) {
+            size = PyTuple_GET_SIZE(arg);
+            if (likely(size == 0))
+                return (*meth)(self, NULL);
+            PyErr_Format(PyExc_TypeError,
+                "%.200s() takes no arguments (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                f->m_ml->ml_name, size);
+            return NULL;
+        }
+        break;  /* keywords were passed: reported below */
+    case METH_O:
+        if (likely(kw == NULL || PyDict_Size(kw) == 0)) {
+            size = PyTuple_GET_SIZE(arg);
+            if (likely(size == 1)) {
+                PyObject *result, *arg0;
+                #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
+                arg0 = PyTuple_GET_ITEM(arg, 0);  /* borrowed reference, no release needed */
+                #else
+                arg0 = PySequence_ITEM(arg, 0); if (unlikely(!arg0)) return NULL;
+                #endif
+                result = (*meth)(self, arg0);
+                #if !(CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS)
+                Py_DECREF(arg0);  /* release the reference obtained from PySequence_ITEM */
+                #endif
+                return result;
+            }
+            PyErr_Format(PyExc_TypeError,
+                "%.200s() takes exactly one argument (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                f->m_ml->ml_name, size);
+            return NULL;
+        }
+        break;  /* keywords were passed: reported below */
+    default:  /* any other flag combination is rejected */
+        PyErr_SetString(PyExc_SystemError, "Bad call flags in "
+                        "__Pyx_CyFunction_Call. METH_OLDARGS is no "
+                        "longer supported!");
+        return NULL;
+    }
+    PyErr_Format(PyExc_TypeError, "%.200s() takes no keyword arguments",  /* reached only via the `break` statements above */
+                 f->m_ml->ml_name);
+    return NULL;
+}
+static CYTHON_INLINE PyObject *__Pyx_CyFunction_Call(PyObject *func, PyObject *arg, PyObject *kw) {  /* call func with the m_self stored in the function object */
+    return __Pyx_CyFunction_CallMethod(func, ((PyCFunctionObject*)func)->m_self, arg, kw);
+}
+static PyObject *__Pyx_CyFunction_CallAsMethod(PyObject *func, PyObject *args, PyObject *kw) {
+    /* tp_call entry point. Functions flagged __Pyx_CYFUNCTION_CCLASS (and not static) take their
+     * receiver from args[0]; every other function is called with the m_self stored in it. */
+    const int flags = ((__pyx_CyFunctionObject *) func)->flags;
+    PyObject *receiver;
+    PyObject *remaining;
+    PyObject *result;
+    if (!(flags & __Pyx_CYFUNCTION_CCLASS) || (flags & __Pyx_CYFUNCTION_STATICMETHOD))
+        return __Pyx_CyFunction_Call(func, args, kw);
+    /* Everything after the first positional argument is forwarded as the argument tuple. */
+    remaining = PyTuple_GetSlice(args, 1, PyTuple_GET_SIZE(args));
+    if (unlikely(!remaining))
+        return NULL;
+    /* PyTuple_GetItem fails (returns NULL) when args is empty. */
+    receiver = PyTuple_GetItem(args, 0);
+    if (unlikely(!receiver)) {
+        Py_DECREF(remaining);
+        return NULL;
+    }
+    result = __Pyx_CyFunction_CallMethod(func, receiver, remaining, kw);
+    Py_DECREF(remaining);
+    return result;
+}
+static PyTypeObject __pyx_CyFunctionType_type = {  /* positional initializer; slot names follow CPython's PyTypeObject layout */
+    PyVarObject_HEAD_INIT(0, 0)
+    "cython_function_or_method",  /* tp_name */
+    sizeof(__pyx_CyFunctionObject),  /* tp_basicsize */
+    0,  /* tp_itemsize */
+    (destructor) __Pyx_CyFunction_dealloc,  /* tp_dealloc */
+    0,  /* tp_print */
+    0,  /* tp_getattr */
+    0,  /* tp_setattr */
+#if PY_MAJOR_VERSION < 3
+    0,  /* tp_compare */
+#else
+    0,  /* tp_as_async (reserved slot on early Python 3) */
+#endif
+    (reprfunc) __Pyx_CyFunction_repr,  /* tp_repr */
+    0,  /* tp_as_number */
+    0,  /* tp_as_sequence */
+    0,  /* tp_as_mapping */
+    0,  /* tp_hash */
+    __Pyx_CyFunction_CallAsMethod,  /* tp_call */
+    0,  /* tp_str */
+    0,  /* tp_getattro */
+    0,  /* tp_setattro */
+    0,  /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,  /* tp_flags */
+    0,  /* tp_doc */
+    (traverseproc) __Pyx_CyFunction_traverse,  /* tp_traverse */
+    (inquiry) __Pyx_CyFunction_clear,  /* tp_clear */
+    0,  /* tp_richcompare */
+#if PY_VERSION_HEX < 0x030500A0
+    offsetof(__pyx_CyFunctionObject, func_weakreflist),  /* tp_weaklistoffset */
+#else
+    offsetof(PyCFunctionObject, m_weakreflist),  /* tp_weaklistoffset */
+#endif
+    0,  /* tp_iter */
+    0,  /* tp_iternext */
+    __pyx_CyFunction_methods,  /* tp_methods */
+    __pyx_CyFunction_members,  /* tp_members */
+    __pyx_CyFunction_getsets,  /* tp_getset */
+    0,  /* tp_base */
+    0,  /* tp_dict */
+    __Pyx_CyFunction_descr_get,  /* tp_descr_get */
+    0,  /* tp_descr_set */
+    offsetof(__pyx_CyFunctionObject, func_dict),  /* tp_dictoffset */
+    0,  /* tp_init */
+    0,  /* tp_alloc */
+    0,  /* tp_new */
+    0,  /* tp_free */
+    0,  /* tp_is_gc */
+    0,  /* tp_bases */
+    0,  /* tp_mro */
+    0,  /* tp_cache */
+    0,  /* tp_subclasses */
+    0,  /* tp_weaklist */
+    0,  /* tp_del */
+    0,  /* tp_version_tag */
+#if PY_VERSION_HEX >= 0x030400a1
+    0,  /* tp_finalize */
+#endif
+};
+static int __pyx_CyFunction_init(void) {
+    /* Store the type returned by __Pyx_FetchCommonType; returns 0 on success and -1 when it is NULL. */
+    __pyx_CyFunctionType = __Pyx_FetchCommonType(&__pyx_CyFunctionType_type);
+    if (likely(__pyx_CyFunctionType != NULL))
+        return 0;
+    return -1;
+}
+static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *func, size_t size, int pyobjects) {  /* allocate a zero-filled defaults block of `size` bytes */
+    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *) func;
+    cyfunc->defaults = PyObject_Malloc(size);
+    if (unlikely(cyfunc->defaults == NULL))
+        return PyErr_NoMemory();  /* sets MemoryError and yields NULL */
+    cyfunc->defaults_pyobjects = pyobjects;  /* number of leading slots handled as object references by traverse/clear */
+    memset(cyfunc->defaults, 0, size);
+    return cyfunc->defaults;
+}
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *func, PyObject *tuple) {
+    /* Store a new strong reference to `tuple`; a previously stored value is overwritten, not released. */
+    Py_INCREF(tuple);
+    ((__pyx_CyFunctionObject *) func)->defaults_tuple = tuple;
+}
+static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsKwDict(PyObject *func, PyObject *dict) {
+    /* Store a new strong reference to `dict`; a previously stored value is overwritten, not released. */
+    Py_INCREF(dict);
+    ((__pyx_CyFunctionObject *) func)->defaults_kwdict = dict;
+}
+static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *func, PyObject *dict) {
+    /* Store a new strong reference to `dict`; a previously stored value is overwritten, not released. */
+    Py_INCREF(dict);
+    ((__pyx_CyFunctionObject *) func)->func_annotations = dict;
+}
+
+/* CalculateMetaclass */
+          static PyObject *__Pyx_CalculateMetaclass(PyTypeObject *metaclass, PyObject *bases) {
+    /* Pick the most derived type among `metaclass` (may be NULL) and the types of all entries in `bases`. */
+    const Py_ssize_t nbases = PyTuple_GET_SIZE(bases);
+    Py_ssize_t idx;
+    for (idx = 0; idx < nbases; idx++) {
+        PyTypeObject *base_type = Py_TYPE(PyTuple_GET_ITEM(bases, idx));
+#if PY_MAJOR_VERSION < 3
+        if (base_type == &PyClass_Type)
+            continue;
+#endif
+        if (!metaclass) {
+            metaclass = base_type;  /* first candidate seen */
+        } else if (PyType_IsSubtype(metaclass, base_type)) {
+            /* the current candidate already covers this base: keep it */
+        } else if (PyType_IsSubtype(base_type, metaclass)) {
+            metaclass = base_type;  /* this base's type is more derived */
+        } else {
+            PyErr_SetString(PyExc_TypeError,
+                            "metaclass conflict: "
+                            "the metaclass of a derived class "
+                            "must be a (non-strict) subclass "
+                            "of the metaclasses of all its bases");
+            return NULL;
+        }
+    }
+    /* Nothing supplied a candidate: fall back to the interpreter's default class type. */
+    if (!metaclass) {
+#if PY_MAJOR_VERSION >= 3
+        metaclass = &PyType_Type;
+#else
+        metaclass = &PyClass_Type;
+#endif
+    }
+    /* The result is handed back as a new reference. */
+    Py_INCREF((PyObject*) metaclass);
+    return (PyObject*) metaclass;
+}
+
+/* Py3ClassCreate */
+          static PyObject *__Pyx_Py3MetaclassPrepare(PyObject *metaclass, PyObject *bases, PyObject *name,
+                                           PyObject *qualname, PyObject *mkw, PyObject *modname, PyObject *doc) {
+    /* Build the class namespace: the result of calling the metaclass attribute __pyx_n_s_prepare with
+     * (name, bases) and keywords mkw when that attribute exists, otherwise a fresh dict. */
+    PyObject *ns, *prepare, *call_args;
+    if (!metaclass) {
+        ns = PyDict_New();
+    } else if ((prepare = __Pyx_PyObject_GetAttrStr(metaclass, __pyx_n_s_prepare)) != NULL) {
+        call_args = PyTuple_Pack(2, name, bases);
+        if (unlikely(!call_args)) {
+            Py_DECREF(prepare);
+            return NULL;
+        }
+        ns = PyObject_Call(prepare, call_args, mkw);
+        Py_DECREF(prepare);
+        Py_DECREF(call_args);
+    } else if (likely(PyErr_ExceptionMatches(PyExc_AttributeError))) {
+        PyErr_Clear();
+        ns = PyDict_New();
+    } else {
+        return NULL;
+    }
+    if (unlikely(!ns))
+        return NULL;
+    /* Seed the namespace; `doc` is optional and skipped when NULL. */
+    if (unlikely(PyObject_SetItem(ns, __pyx_n_s_module, modname) < 0) ||
+        unlikely(PyObject_SetItem(ns, __pyx_n_s_qualname, qualname) < 0) ||
+        unlikely(doc && PyObject_SetItem(ns, __pyx_n_s_doc, doc) < 0)) {
+        Py_DECREF(ns);
+        return NULL;
+    }
+    return ns;
+}
+static PyObject *__Pyx_Py3ClassCreate(PyObject *metaclass, PyObject *name, PyObject *bases,
+                                      PyObject *dict, PyObject *mkw,
+                                      int calculate_metaclass, int allow_py2_metaclass) {
+    /* Create the class by calling the effective metaclass with (name, bases, dict) and keywords mkw. */
+    PyObject *result = NULL;
+    PyObject *margs;
+    PyObject *held = NULL;  /* metaclass reference owned by this function, released before returning */
+    if (allow_py2_metaclass) {
+        held = PyObject_GetItem(dict, __pyx_n_s_metaclass);
+        if (held) {
+            metaclass = held;
+        } else {
+            if (unlikely(!PyErr_ExceptionMatches(PyExc_KeyError)))
+                return NULL;
+            PyErr_Clear();
+        }
+    }
+    if (calculate_metaclass && (!metaclass || PyType_Check(metaclass))) {
+        PyObject *computed = __Pyx_CalculateMetaclass((PyTypeObject*) metaclass, bases);
+        Py_XDECREF(held);
+        if (unlikely(!computed))
+            return NULL;
+        metaclass = held = computed;
+    }
+    margs = PyTuple_Pack(3, name, bases, dict);
+    if (likely(margs)) {
+        result = PyObject_Call(metaclass, margs, mkw);
+        Py_DECREF(margs);
+    }
+    Py_XDECREF(held);
+    return result;
+}
+
+/* PyErrFetchRestore */
+          #if CYTHON_FAST_THREAD_STATE
+static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) {
+    /* Install (type, value, tb) as the current exception without taking new references, then release the old triple. */
+    PyObject *old_type = tstate->curexc_type;
+    PyObject *old_value = tstate->curexc_value;
+    PyObject *old_tb = tstate->curexc_traceback;
+    tstate->curexc_type = type;
+    tstate->curexc_value = value;
+    tstate->curexc_traceback = tb;
+    Py_XDECREF(old_type);
+    Py_XDECREF(old_value);
+    Py_XDECREF(old_tb);
+}
+static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) {  /* move the current exception out of tstate into the caller's variables */
+    *type = tstate->curexc_type;
+    tstate->curexc_type = NULL;
+    *value = tstate->curexc_value;
+    tstate->curexc_value = NULL;
+    *tb = tstate->curexc_traceback;
+    tstate->curexc_traceback = NULL;
+}
+#endif
+
+/* CLineInTraceback */
+          #ifndef CYTHON_CLINE_IN_TRACEBACK
+static int __Pyx_CLineForTraceback(CYTHON_UNUSED PyThreadState *tstate, int c_line) {  /* returns c_line, or 0 when the runtime setting disables C lines in tracebacks */
+    PyObject *use_cline;
+    PyObject *ptype, *pvalue, *ptraceback;
+#if CYTHON_COMPILING_IN_CPYTHON
+    PyObject **cython_runtime_dict;
+#endif
+    if (unlikely(!__pyx_cython_runtime)) {  /* no runtime object to consult: leave c_line untouched */
+        return c_line;
+    }
+    __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback);  /* set the current exception aside while the lookup runs */
+#if CYTHON_COMPILING_IN_CPYTHON
+    cython_runtime_dict = _PyObject_GetDictPtr(__pyx_cython_runtime);
+    if (likely(cython_runtime_dict)) {
+      use_cline = __Pyx_PyDict_GetItemStr(*cython_runtime_dict, __pyx_n_s_cline_in_traceback);
+    } else
+#endif
+    {  /* generic path; on CPython this is the body of the `else` above */
+      PyObject *use_cline_obj = __Pyx_PyObject_GetAttrStr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback);
+      if (use_cline_obj) {
+        use_cline = PyObject_Not(use_cline_obj) ? Py_False : Py_True;  /* reduce to a bool singleton so the reference can be dropped */
+        Py_DECREF(use_cline_obj);
+      } else {
+        PyErr_Clear();
+        use_cline = NULL;
+      }
+    }
+    if (!use_cline) {  /* setting absent: suppress the C line and record False on the runtime object */
+        c_line = 0;
+        PyObject_SetAttr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback, Py_False);
+    }
+    else if (PyObject_Not(use_cline) != 0) {  /* falsy setting (or error while testing it): suppress the C line */
+        c_line = 0;
+    }
+    __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback);  /* reinstate the saved exception, replacing anything raised above */
+    return c_line;
+}
+#endif
+
+/* CodeObjectCache */
+          static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) {
+    /* Binary search over `entries` ordered by code_line. Returns the index of the entry holding
+     * code_line if present, otherwise the index at which it would have to be inserted. */
+    int lo = 0, probe = 0, hi = count - 1;
+    /* A key beyond the last entry belongs at the very end. */
+    if (hi >= 0 && entries[hi].code_line < code_line) {
+        return count;
+    }
+    while (lo < hi) {
+        probe = lo + (hi - lo) / 2;
+        if (entries[probe].code_line == code_line)
+            return probe;
+        if (entries[probe].code_line > code_line)
+            hi = probe;
+        else
+            lo = probe + 1;
+    }
+    /* No exact hit inside the loop: place the key relative to the entry
+     * that was probed last. */
+    return (code_line <= entries[probe].code_line) ? probe : probe + 1;
+}
+static PyCodeObject *__pyx_find_code_object(int code_line) {
+    /* Return a new reference to the cached code object for code_line, or NULL when none is cached. */
+    __Pyx_CodeObjectCacheEntry *entries = __pyx_code_cache.entries;
+    int pos;
+    if (unlikely(!code_line) || unlikely(!entries))
+        return NULL;
+    pos = __pyx_bisect_code_objects(entries, __pyx_code_cache.count, code_line);
+    if (likely(pos < __pyx_code_cache.count) && likely(entries[pos].code_line == code_line)) {
+        PyCodeObject *found = entries[pos].code_object;
+        Py_INCREF(found);
+        return found;
+    }
+    return NULL;
+}
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) {  /* cache code_object under code_line; the cache takes its own reference; failures are silent */
+    int pos, i;
+    __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries;
+    if (unlikely(!code_line))  /* key 0 is never cached */
+        return;
+    if (unlikely(!entries)) {  /* first insertion: allocate room for 64 entries */
+        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry));
+        if (likely(entries)) {
+            __pyx_code_cache.entries = entries;
+            __pyx_code_cache.max_count = 64;
+            __pyx_code_cache.count = 1;
+            entries[0].code_line = code_line;
+            entries[0].code_object = code_object;
+            Py_INCREF(code_object);
+        }
+        return;
+    }
+    pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line);
+    if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) {
+        PyCodeObject* tmp = entries[pos].code_object;
+        Py_INCREF(code_object);  /* the cache must own the replacement, exactly as on every other insertion path */
+        entries[pos].code_object = code_object;
+        Py_DECREF(tmp);
+        return;
+    }
+    if (__pyx_code_cache.count == __pyx_code_cache.max_count) {  /* full: grow by another 64 entries */
+        int new_max = __pyx_code_cache.max_count + 64;
+        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc(
+            __pyx_code_cache.entries, (size_t)new_max*sizeof(__Pyx_CodeObjectCacheEntry));
+        if (unlikely(!entries)) {
+            return;
+        }
+        __pyx_code_cache.entries = entries;
+        __pyx_code_cache.max_count = new_max;
+    }
+    for (i=__pyx_code_cache.count; i>pos; i--) {  /* shift the tail up to open slot `pos` */
+        entries[i] = entries[i-1];
+    }
+    entries[pos].code_line = code_line;
+    entries[pos].code_object = code_object;
+    __pyx_code_cache.count++;
+    Py_INCREF(code_object);
+}
+
+/* AddTraceback */
+          #include "compile.h"
+#include "frameobject.h"
+#include "traceback.h"
+static PyCodeObject* __Pyx_CreateCodeObjectForTraceback(
+            const char *funcname, int c_line,
+            int py_line, const char *filename) {
+    /* Build a code object carrying `filename`, a function name and `py_line`, with empty code and tables.
+     * Returns a new reference, or NULL if any step fails. */
+    PyCodeObject *py_code = NULL;
+    PyObject *py_funcname = NULL;
+    PyObject *py_srcfile;
+    #if PY_MAJOR_VERSION >= 3
+    py_srcfile = PyUnicode_FromString(filename);
+    #else
+    py_srcfile = PyString_FromString(filename);
+    #endif
+    if (!py_srcfile) goto done;
+    if (!c_line) {
+        #if PY_MAJOR_VERSION >= 3
+        py_funcname = PyUnicode_FromString(funcname);
+        #else
+        py_funcname = PyString_FromString(funcname);
+        #endif
+    }
+    else {
+        /* With a C line number the name becomes "<funcname> (<__pyx_cfilenm>:<c_line>)". */
+        #if PY_MAJOR_VERSION >= 3
+        py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
+        #else
+        py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
+        #endif
+    }
+    if (!py_funcname) goto done;
+    py_code = __Pyx_PyCode_New(
+        0,
+        0,
+        0,
+        0,
+        0,
+        __pyx_empty_bytes, /* code */
+        __pyx_empty_tuple, /* consts */
+        __pyx_empty_tuple, /* names */
+        __pyx_empty_tuple, /* varnames */
+        __pyx_empty_tuple, /* freevars */
+        __pyx_empty_tuple, /* cellvars */
+        py_srcfile,        /* filename */
+        py_funcname,       /* name */
+        py_line,
+        __pyx_empty_bytes  /* lnotab */
+    );
+done:
+    Py_XDECREF(py_srcfile);
+    Py_XDECREF(py_funcname);
+    return py_code;
+}
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename) {
+    /* Add a frame for funcname/filename at py_line to the current traceback.
+     * Code objects are reused through the code-object cache; failures are silent. */
+    PyThreadState *tstate = __Pyx_PyThreadState_Current;
+    PyFrameObject *py_frame = NULL;
+    PyCodeObject *py_code;
+    int cache_key;
+    if (c_line)
+        c_line = __Pyx_CLineForTraceback(tstate, c_line);
+    /* Entries that carry a C line are keyed by the negated C line, all others by the Python line. */
+    cache_key = c_line ? -c_line : py_line;
+    py_code = __pyx_find_code_object(cache_key);
+    if (!py_code) {
+        py_code = __Pyx_CreateCodeObjectForTraceback(funcname, c_line, py_line, filename);
+        if (!py_code) goto cleanup;
+        __pyx_insert_code_object(cache_key, py_code);
+    }
+    /* Frame uses the module globals __pyx_d and no locals. */
+    py_frame = PyFrame_New(tstate, py_code, __pyx_d, NULL);
+    if (py_frame) {
+        __Pyx_PyFrame_SetLineNumber(py_frame, py_line);
+        PyTraceBack_Here(py_frame);
+    }
+cleanup:
+    Py_XDECREF(py_code);
+    Py_XDECREF(py_frame);
+}
+
+/* CIntToPy */
+          static CYTHON_INLINE PyObject* __Pyx_PyInt_From_gdf_dtype(gdf_dtype value) {  /* convert a gdf_dtype value to a Python integer object */
+    const gdf_dtype neg_one = (gdf_dtype) -1, const_zero = (gdf_dtype) 0;
+    const int is_unsigned = neg_one > const_zero;  /* -1 converted to an unsigned type compares greater than 0 */
+    if (is_unsigned) {
+        if (sizeof(gdf_dtype) < sizeof(long)) {  /* strictly narrower than long: every value fits in a signed long */
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(gdf_dtype) <= sizeof(unsigned long)) {
+            return PyLong_FromUnsignedLong((unsigned long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(gdf_dtype) <= sizeof(unsigned PY_LONG_LONG)) {
+            return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value);
+#endif
+        }
+    } else {
+        if (sizeof(gdf_dtype) <= sizeof(long)) {
+            return PyInt_FromLong((long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(gdf_dtype) <= sizeof(PY_LONG_LONG)) {
+            return PyLong_FromLongLong((PY_LONG_LONG) value);
+#endif
+        }
+    }
+    {  /* fallback when no branch above returned: convert from the raw bytes */
+        int one = 1; int little = (int)*(unsigned char *)&one;  /* 1 when the lowest-addressed byte of an int holds its low-order bits */
+        unsigned char *bytes = (unsigned char *)&value;
+        return _PyLong_FromByteArray(bytes, sizeof(gdf_dtype),
+                                     little, !is_unsigned);
+    }
+}
+
+/* CIntFromPyVerify */
+          #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value) /* variant with exc = 0: no PyErr_Occurred() check */\
+    __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0)
+#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value) /* variant with exc = 1: a -1 result with an error set is propagated */\
+    __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1)
+#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc) /* expands to a block that returns from the enclosing function */\
+    {\
+        func_type value = func_value;\
+        if (sizeof(target_type) < sizeof(func_type)) {\
+            if (unlikely(value != (func_type) (target_type) value)) { /* value does not survive the round trip through target_type */\
+                func_type zero = 0;\
+                if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\
+                    return (target_type) -1;\
+                if (is_unsigned && unlikely(value < zero)) /* is_unsigned and both labels must exist in the expanding function */\
+                    goto raise_neg_overflow;\
+                else\
+                    goto raise_overflow;\
+            }\
+        }\
+        return (target_type) value;\
+    }
+
+/* Print */
+          #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION < 3
+static PyObject *__Pyx_GetStdout(void) {  /* result of PySys_GetObject("stdout"); NULL with RuntimeError set when it is missing */
+    PyObject *out = PySys_GetObject((char *)"stdout");
+    if (out)
+        return out;
+    PyErr_SetString(PyExc_RuntimeError, "lost sys.stdout");
+    return NULL;
+}
+static int __Pyx_Print(PyObject* f, PyObject *arg_tuple, int newline) {  /* write the items of arg_tuple to f (stdout when f is NULL); 0 on success, -1 on error */
+    int i;
+    if (!f) {
+        if (!(f = __Pyx_GetStdout()))
+            return -1;
+    }
+    Py_INCREF(f);  /* keep the file object alive for the duration of the writes */
+    for (i=0; i < PyTuple_GET_SIZE(arg_tuple); i++) {
+        PyObject* v;
+        if (PyFile_SoftSpace(f, 1)) {  /* sets the softspace flag to 1; a previously set flag means a separator is due */
+            if (PyFile_WriteString(" ", f) < 0)
+                goto error;
+        }
+        v = PyTuple_GET_ITEM(arg_tuple, i);
+        if (PyFile_WriteObject(v, f, Py_PRINT_RAW) < 0)
+            goto error;
+        if (PyString_Check(v)) {  /* inspect the last character of string items */
+            char *s = PyString_AsString(v);
+            Py_ssize_t len = PyString_Size(v);
+            if (len > 0) {
+                switch (s[len-1]) {
+                    case ' ': break;
+                    case '\f': case '\r': case '\n': case '\t': case '\v':
+                        PyFile_SoftSpace(f, 0);  /* item ended in whitespace other than a blank: no separator before the next item */
+                        break;
+                    default:  break;
+                }
+            }
+        }
+    }
+    if (newline) {
+        if (PyFile_WriteString("\n", f) < 0)
+            goto error;
+        PyFile_SoftSpace(f, 0);  /* a fresh line needs no leading separator */
+    }
+    Py_DECREF(f);
+    return 0;
+error:
+    Py_DECREF(f);  /* balance the Py_INCREF taken above */
+    return -1;
+}
+#else
+static int __Pyx_Print(PyObject* stream, PyObject *arg_tuple, int newline) {
+    PyObject* kwargs = 0;
+    PyObject* result = 0;
+    PyObject* end_string;
+    if (unlikely(!__pyx_print)) {
+        __pyx_print = PyObject_GetAttr(__pyx_b, __pyx_n_s_print);
+        if (!__pyx_print)
+            return -1;
+    }
+    if (stream) {
+        kwargs = PyDict_New();
+        if (unlikely(!kwargs))
+            return -1;
+        if (unlikely(PyDict_SetItem(kwargs, __pyx_n_s_file, stream) < 0))
+            goto bad;
+        if (!newline) {
+            end_string = PyUnicode_FromStringAndSize(" ", 1);
+            if (unlikely(!end_string))
+                goto bad;
+            if (PyDict_SetItem(kwargs, __pyx_n_s_end, end_string) < 0) {
+                Py_DECREF(end_string);
+                goto bad;
+            }
+            Py_DECREF(end_string);
+        }
+    } else if (!newline) {
+        if (unlikely(!__pyx_print_kwargs)) {
+            __pyx_print_kwargs = PyDict_New();
+            if (unlikely(!__pyx_print_kwargs))
+                return -1;
+            end_string = PyUnicode_FromStringAndSize(" ", 1);
+            if (unlikely(!end_string))
+                return -1;
+            if (PyDict_SetItem(__pyx_print_kwargs, __pyx_n_s_end, end_string) < 0) {
+                Py_DECREF(end_string);
+                return -1;
+            }
+            Py_DECREF(end_string);
+        }
+        kwargs = __pyx_print_kwargs;
+    }
+    result = PyObject_Call(__pyx_print, arg_tuple, kwargs);
+    if (unlikely(kwargs) && (kwargs != __pyx_print_kwargs))
+        Py_DECREF(kwargs);
+    if (!result)
+        return -1;
+    Py_DECREF(result);
+    return 0;
+bad:
+    if (kwargs != __pyx_print_kwargs)
+        Py_XDECREF(kwargs);
+    return -1;
+}
+#endif
+
+/* CIntFromPy */
+          static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *x) {
+    const size_t neg_one = (size_t) -1, const_zero = (size_t) 0;
+    const int is_unsigned = neg_one > const_zero;
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) {
+        if (sizeof(size_t) < sizeof(long)) {
+            __PYX_VERIFY_RETURN_INT(size_t, long, PyInt_AS_LONG(x))
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                goto raise_neg_overflow;
+            }
+            return (size_t) val;
+        }
+    } else
+#endif
+    if (likely(PyLong_Check(x))) {
+        if (is_unsigned) {
+#if CYTHON_USE_PYLONG_INTERNALS
+            const digit* digits = ((PyLongObject*)x)->ob_digit;
+            switch (Py_SIZE(x)) {
+                case  0: return (size_t) 0;
+                case  1: __PYX_VERIFY_RETURN_INT(size_t, digit, digits[0])
+                case 2:
+                    if (8 * sizeof(size_t) > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(size_t) >= 2 * PyLong_SHIFT) {
+                            return (size_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+                        }
+                    }
+                    break;
+                case 3:
+                    if (8 * sizeof(size_t) > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(size_t) >= 3 * PyLong_SHIFT) {
+                            return (size_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if (8 * sizeof(size_t) > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(size_t) >= 4 * PyLong_SHIFT) {
+                            return (size_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+                        }
+                    }
+                    break;
+            }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON
+            if (unlikely(Py_SIZE(x) < 0)) {
+                goto raise_neg_overflow;
+            }
+#else
+            {
+                int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
+                if (unlikely(result < 0))
+                    return (size_t) -1;
+                if (unlikely(result == 1))
+                    goto raise_neg_overflow;
+            }
+#endif
+            if (sizeof(size_t) <= sizeof(unsigned long)) {
+                __PYX_VERIFY_RETURN_INT_EXC(size_t, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+            } else if (sizeof(size_t) <= sizeof(unsigned PY_LONG_LONG)) {
+                __PYX_VERIFY_RETURN_INT_EXC(size_t, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+            }
+        } else {
+#if CYTHON_USE_PYLONG_INTERNALS
+            const digit* digits = ((PyLongObject*)x)->ob_digit;
+            switch (Py_SIZE(x)) {
+                case  0: return (size_t) 0;
+                case -1: __PYX_VERIFY_RETURN_INT(size_t, sdigit, (sdigit) (-(sdigit)digits[0]))
+                case  1: __PYX_VERIFY_RETURN_INT(size_t,  digit, +digits[0])
+                case -2:
+                    if (8 * sizeof(size_t) - 1 > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(size_t, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(size_t) - 1 > 2 * PyLong_SHIFT) {
+                            return (size_t) (((size_t)-1)*(((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if (8 * sizeof(size_t) > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(size_t) - 1 > 2 * PyLong_SHIFT) {
+                            return (size_t) ((((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if (8 * sizeof(size_t) - 1 > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(size_t, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(size_t) - 1 > 3 * PyLong_SHIFT) {
+                            return (size_t) (((size_t)-1)*(((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if (8 * sizeof(size_t) > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(size_t) - 1 > 3 * PyLong_SHIFT) {
+                            return (size_t) ((((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if (8 * sizeof(size_t) - 1 > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(size_t, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(size_t) - 1 > 4 * PyLong_SHIFT) {
+                            return (size_t) (((size_t)-1)*(((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if (8 * sizeof(size_t) > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(size_t) - 1 > 4 * PyLong_SHIFT) {
+                            return (size_t) ((((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
+                        }
+                    }
+                    break;
+            }
+#endif
+            if (sizeof(size_t) <= sizeof(long)) {
+                __PYX_VERIFY_RETURN_INT_EXC(size_t, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+            } else if (sizeof(size_t) <= sizeof(PY_LONG_LONG)) {
+                __PYX_VERIFY_RETURN_INT_EXC(size_t, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+            }
+        }
+        {
+#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
+            PyErr_SetString(PyExc_RuntimeError,
+                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
+#else
+            size_t val;
+            PyObject *v = __Pyx_PyNumber_IntOrLong(x);
+ #if PY_MAJOR_VERSION < 3
+            if (likely(v) && !PyLong_Check(v)) {
+                PyObject *tmp = v;
+                v = PyNumber_Long(tmp);
+                Py_DECREF(tmp);
+            }
+ #endif
+            if (likely(v)) {
+                int one = 1; int is_little = (int)*(unsigned char *)&one;
+                unsigned char *bytes = (unsigned char *)&val;
+                int ret = _PyLong_AsByteArray((PyLongObject *)v,
+                                              bytes, sizeof(val),
+                                              is_little, !is_unsigned);
+                Py_DECREF(v);
+                if (likely(!ret))
+                    return val;
+            }
+#endif
+            return (size_t) -1;
+        }
+    } else {
+        size_t val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (size_t) -1;
+        val = __Pyx_PyInt_As_size_t(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+raise_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to size_t");
+    return (size_t) -1;
+raise_neg_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to size_t");
+    return (size_t) -1;
+}
+
+/* CIntFromPy: convert the Python object x to a C gdf_dtype. Every failure path visible here returns (gdf_dtype) -1; the two labels at the end set OverflowError first. */
+          static CYTHON_INLINE gdf_dtype __Pyx_PyInt_As_gdf_dtype(PyObject *x) {
+    const gdf_dtype neg_one = (gdf_dtype) -1, const_zero = (gdf_dtype) 0;
+    const int is_unsigned = neg_one > const_zero; /* nonzero when (gdf_dtype) -1 compares greater than 0, i.e. gdf_dtype is an unsigned type */
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) { /* Python 2 only: x is a PyInt, read through PyInt_AS_LONG */
+        if (sizeof(gdf_dtype) < sizeof(long)) {
+            __PYX_VERIFY_RETURN_INT(gdf_dtype, long, PyInt_AS_LONG(x)) /* macro defined outside this excerpt; presumably range-checks, then returns or jumps to an overflow label - TODO confirm */
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) { /* a negative value cannot be stored in an unsigned target */
+                goto raise_neg_overflow;
+            }
+            return (gdf_dtype) val;
+        }
+    } else
+#endif
+    if (likely(PyLong_Check(x))) {
+        if (is_unsigned) {
+#if CYTHON_USE_PYLONG_INTERNALS
+            const digit* digits = ((PyLongObject*)x)->ob_digit; /* direct access to the PyLong digit array */
+            switch (Py_SIZE(x)) { /* fast paths for 0 to 4 digits; digits[0] is the least significant and each digit is shifted by PyLong_SHIFT bits */
+                case  0: return (gdf_dtype) 0;
+                case  1: __PYX_VERIFY_RETURN_INT(gdf_dtype, digit, digits[0])
+                case 2:
+                    if (8 * sizeof(gdf_dtype) > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(gdf_dtype) >= 2 * PyLong_SHIFT) {
+                            return (gdf_dtype) (((((gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0]));
+                        }
+                    }
+                    break;
+                case 3:
+                    if (8 * sizeof(gdf_dtype) > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(gdf_dtype) >= 3 * PyLong_SHIFT) {
+                            return (gdf_dtype) (((((((gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if (8 * sizeof(gdf_dtype) > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(gdf_dtype) >= 4 * PyLong_SHIFT) {
+                            return (gdf_dtype) (((((((((gdf_dtype)digits[3]) << PyLong_SHIFT) | (gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0]));
+                        }
+                    }
+                    break;
+            }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON
+            if (unlikely(Py_SIZE(x) < 0)) { /* a negative Py_SIZE marks a negative PyLong */
+                goto raise_neg_overflow;
+            }
+#else
+            {
+                int result = PyObject_RichCompareBool(x, Py_False, Py_LT); /* portable sign test: evaluates x < False */
+                if (unlikely(result < 0))
+                    return (gdf_dtype) -1; /* the comparison itself failed */
+                if (unlikely(result == 1))
+                    goto raise_neg_overflow;
+            }
+#endif
+            if (sizeof(gdf_dtype) <= sizeof(unsigned long)) {
+                __PYX_VERIFY_RETURN_INT_EXC(gdf_dtype, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+            } else if (sizeof(gdf_dtype) <= sizeof(unsigned PY_LONG_LONG)) {
+                __PYX_VERIFY_RETURN_INT_EXC(gdf_dtype, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+            }
+        } else {
+#if CYTHON_USE_PYLONG_INTERNALS
+            const digit* digits = ((PyLongObject*)x)->ob_digit;
+            switch (Py_SIZE(x)) { /* signed target: negative case labels handle negative values by negating the assembled magnitude */
+                case  0: return (gdf_dtype) 0;
+                case -1: __PYX_VERIFY_RETURN_INT(gdf_dtype, sdigit, (sdigit) (-(sdigit)digits[0]))
+                case  1: __PYX_VERIFY_RETURN_INT(gdf_dtype,  digit, +digits[0])
+                case -2:
+                    if (8 * sizeof(gdf_dtype) - 1 > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(gdf_dtype, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(gdf_dtype) - 1 > 2 * PyLong_SHIFT) {
+                            return (gdf_dtype) (((gdf_dtype)-1)*(((((gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if (8 * sizeof(gdf_dtype) > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(gdf_dtype) - 1 > 2 * PyLong_SHIFT) {
+                            return (gdf_dtype) ((((((gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if (8 * sizeof(gdf_dtype) - 1 > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(gdf_dtype, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(gdf_dtype) - 1 > 3 * PyLong_SHIFT) {
+                            return (gdf_dtype) (((gdf_dtype)-1)*(((((((gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if (8 * sizeof(gdf_dtype) > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(gdf_dtype) - 1 > 3 * PyLong_SHIFT) {
+                            return (gdf_dtype) ((((((((gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if (8 * sizeof(gdf_dtype) - 1 > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(gdf_dtype, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(gdf_dtype) - 1 > 4 * PyLong_SHIFT) {
+                            return (gdf_dtype) (((gdf_dtype)-1)*(((((((((gdf_dtype)digits[3]) << PyLong_SHIFT) | (gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if (8 * sizeof(gdf_dtype) > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(gdf_dtype) - 1 > 4 * PyLong_SHIFT) {
+                            return (gdf_dtype) ((((((((((gdf_dtype)digits[3]) << PyLong_SHIFT) | (gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
+                        }
+                    }
+                    break;
+            }
+#endif
+            if (sizeof(gdf_dtype) <= sizeof(long)) {
+                __PYX_VERIFY_RETURN_INT_EXC(gdf_dtype, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+            } else if (sizeof(gdf_dtype) <= sizeof(PY_LONG_LONG)) {
+                __PYX_VERIFY_RETURN_INT_EXC(gdf_dtype, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+            }
+        }
+        { /* last resort, reached when no branch above returned: copy the integer's bytes straight into val */
+#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
+            PyErr_SetString(PyExc_RuntimeError,
+                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
+#else
+            gdf_dtype val;
+            PyObject *v = __Pyx_PyNumber_IntOrLong(x);
+ #if PY_MAJOR_VERSION < 3
+            if (likely(v) && !PyLong_Check(v)) { /* Python 2: coerce a non-PyLong result to PyLong before the byte copy */
+                PyObject *tmp = v;
+                v = PyNumber_Long(tmp);
+                Py_DECREF(tmp);
+            }
+ #endif
+            if (likely(v)) {
+                int one = 1; int is_little = (int)*(unsigned char *)&one; /* run-time endianness probe: first byte of the int 1 */
+                unsigned char *bytes = (unsigned char *)&val;
+                int ret = _PyLong_AsByteArray((PyLongObject *)v,
+                                              bytes, sizeof(val),
+                                              is_little, !is_unsigned); /* last argument requests a signed conversion when the target is signed */
+                Py_DECREF(v);
+                if (likely(!ret))
+                    return val;
+            }
+#endif
+            return (gdf_dtype) -1;
+        }
+    } else { /* x is neither PyInt nor PyLong: coerce it with __Pyx_PyNumber_IntOrLong and convert the result recursively */
+        gdf_dtype val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (gdf_dtype) -1;
+        val = __Pyx_PyInt_As_gdf_dtype(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+raise_overflow: /* no goto to this label is visible in the function text; presumably the __PYX_VERIFY_RETURN_INT* macros jump here - TODO confirm */
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to gdf_dtype");
+    return (gdf_dtype) -1;
+raise_neg_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to gdf_dtype");
+    return (gdf_dtype) -1;
+}
+
+/* CIntFromPy: convert the Python object x to a C int. Every failure path visible here returns (int) -1; the two labels at the end set OverflowError first. */
+          static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) {
+    const int neg_one = (int) -1, const_zero = (int) 0;
+    const int is_unsigned = neg_one > const_zero; /* signedness probe; evaluates to 0 here because (int) -1 is not greater than 0 */
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) { /* Python 2 only: x is a PyInt, read through PyInt_AS_LONG */
+        if (sizeof(int) < sizeof(long)) {
+            __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x)) /* macro defined outside this excerpt; presumably range-checks, then returns or jumps to an overflow label - TODO confirm */
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                goto raise_neg_overflow;
+            }
+            return (int) val;
+        }
+    } else
+#endif
+    if (likely(PyLong_Check(x))) {
+        if (is_unsigned) { /* not taken for int (is_unsigned is 0); the branch has the same shape as the unsigned branch of the sibling converters */
+#if CYTHON_USE_PYLONG_INTERNALS
+            const digit* digits = ((PyLongObject*)x)->ob_digit;
+            switch (Py_SIZE(x)) {
+                case  0: return (int) 0;
+                case  1: __PYX_VERIFY_RETURN_INT(int, digit, digits[0])
+                case 2:
+                    if (8 * sizeof(int) > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(int) >= 2 * PyLong_SHIFT) {
+                            return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
+                        }
+                    }
+                    break;
+                case 3:
+                    if (8 * sizeof(int) > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(int) >= 3 * PyLong_SHIFT) {
+                            return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if (8 * sizeof(int) > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(int) >= 4 * PyLong_SHIFT) {
+                            return (int) (((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
+                        }
+                    }
+                    break;
+            }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON
+            if (unlikely(Py_SIZE(x) < 0)) {
+                goto raise_neg_overflow;
+            }
+#else
+            {
+                int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
+                if (unlikely(result < 0))
+                    return (int) -1;
+                if (unlikely(result == 1))
+                    goto raise_neg_overflow;
+            }
+#endif
+            if (sizeof(int) <= sizeof(unsigned long)) {
+                __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+            } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) {
+                __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+            }
+        } else {
+#if CYTHON_USE_PYLONG_INTERNALS
+            const digit* digits = ((PyLongObject*)x)->ob_digit; /* direct access to the PyLong digit array */
+            switch (Py_SIZE(x)) { /* fast paths for -4 to 4 digits; digits[0] is least significant, negative sizes negate the assembled magnitude */
+                case  0: return (int) 0;
+                case -1: __PYX_VERIFY_RETURN_INT(int, sdigit, (sdigit) (-(sdigit)digits[0]))
+                case  1: __PYX_VERIFY_RETURN_INT(int,  digit, +digits[0])
+                case -2:
+                    if (8 * sizeof(int) - 1 > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) {
+                            return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if (8 * sizeof(int) > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) {
+                            return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) {
+                            return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if (8 * sizeof(int) > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) {
+                            return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) {
+                            return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if (8 * sizeof(int) > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) {
+                            return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
+                        }
+                    }
+                    break;
+            }
+#endif
+            if (sizeof(int) <= sizeof(long)) {
+                __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+            } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) {
+                __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+            }
+        }
+        { /* last resort, reached when no branch above returned: copy the integer's bytes straight into val */
+#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
+            PyErr_SetString(PyExc_RuntimeError,
+                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
+#else
+            int val;
+            PyObject *v = __Pyx_PyNumber_IntOrLong(x);
+ #if PY_MAJOR_VERSION < 3
+            if (likely(v) && !PyLong_Check(v)) { /* Python 2: coerce a non-PyLong result to PyLong before the byte copy */
+                PyObject *tmp = v;
+                v = PyNumber_Long(tmp);
+                Py_DECREF(tmp);
+            }
+ #endif
+            if (likely(v)) {
+                int one = 1; int is_little = (int)*(unsigned char *)&one; /* run-time endianness probe: first byte of the int 1 */
+                unsigned char *bytes = (unsigned char *)&val;
+                int ret = _PyLong_AsByteArray((PyLongObject *)v,
+                                              bytes, sizeof(val),
+                                              is_little, !is_unsigned); /* last argument requests a signed conversion when the target is signed */
+                Py_DECREF(v);
+                if (likely(!ret))
+                    return val;
+            }
+#endif
+            return (int) -1;
+        }
+    } else { /* x is neither PyInt nor PyLong: coerce it with __Pyx_PyNumber_IntOrLong and convert the result recursively */
+        int val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (int) -1;
+        val = __Pyx_PyInt_As_int(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+raise_overflow: /* no goto to this label is visible in the function text; presumably the __PYX_VERIFY_RETURN_INT* macros jump here - TODO confirm */
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to int");
+    return (int) -1;
+raise_neg_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to int");
+    return (int) -1;
+}
+
+/* PrintOne: write the single object o followed by a newline to a file-like object; returns 0 on success and -1 on failure. Two build-dependent implementations follow. */
+          #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION < 3
+static int __Pyx_PrintOne(PyObject* f, PyObject *o) { /* Python 2 (non-PyPy) variant built on the PyFile_* API */
+    if (!f) { /* no stream supplied: fall back to the object returned by __Pyx_GetStdout() */
+        if (!(f = __Pyx_GetStdout()))
+            return -1;
+    }
+    Py_INCREF(f); /* hold a reference to f for the duration of the writes; released on both exit paths */
+    if (PyFile_SoftSpace(f, 0)) { /* sets the softspace flag to 0; a nonzero return value triggers writing a separating space first */
+        if (PyFile_WriteString(" ", f) < 0)
+            goto error;
+    }
+    if (PyFile_WriteObject(o, f, Py_PRINT_RAW) < 0)
+        goto error;
+    if (PyFile_WriteString("\n", f) < 0)
+        goto error;
+    Py_DECREF(f);
+    return 0;
+error:
+    Py_DECREF(f);
+    return -1;
+    /* the line below is just to avoid C compiler
+     * warnings about unused functions */
+    return __Pyx_Print(f, NULL, 0); /* unreachable by design: only references __Pyx_Print */
+}
+#else
+static int __Pyx_PrintOne(PyObject* stream, PyObject *o) { /* all other builds: delegate to __Pyx_Print with a one-element tuple */
+    int res;
+    PyObject* arg_tuple = PyTuple_Pack(1, o);
+    if (unlikely(!arg_tuple))
+        return -1;
+    res = __Pyx_Print(stream, arg_tuple, 1); /* NOTE(review): the meaning of the final argument 1 is defined by __Pyx_Print, outside this excerpt */
+    Py_DECREF(arg_tuple); /* the temporary tuple is released whether or not printing succeeded */
+    return res;
+}
+#endif
+
+/* CIntToPy: build a Python integer object from the C long value and return the object produced by the selected constructor call. */
+          static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) {
+    const long neg_one = (long) -1, const_zero = (long) 0;
+    const int is_unsigned = neg_one > const_zero; /* signedness probe; evaluates to 0 here because (long) -1 is not greater than 0 */
+    if (is_unsigned) { /* not taken for long; the branch is kept exactly as emitted */
+        if (sizeof(long) < sizeof(long)) {
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(long) <= sizeof(unsigned long)) {
+            return PyLong_FromUnsignedLong((unsigned long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) {
+            return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value);
+#endif
+        }
+    } else {
+        if (sizeof(long) <= sizeof(long)) { /* always true, so this is the path actually executed for long */
+            return PyInt_FromLong((long) value);
+#ifdef HAVE_LONG_LONG
+        } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) {
+            return PyLong_FromLongLong((PY_LONG_LONG) value);
+#endif
+        }
+    }
+    { /* generic fallback: build the integer from the raw bytes of value; not reached for long because the branch above always returns */
+        int one = 1; int little = (int)*(unsigned char *)&one; /* run-time endianness probe: first byte of the int 1 */
+        unsigned char *bytes = (unsigned char *)&value;
+        return _PyLong_FromByteArray(bytes, sizeof(long),
+                                     little, !is_unsigned);
+    }
+}
+
+/* CIntFromPy */
+          static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) {
+    const long neg_one = (long) -1, const_zero = (long) 0;
+    const int is_unsigned = neg_one > const_zero;
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) {
+        if (sizeof(long) < sizeof(long)) {
+            __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG(x))
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                goto raise_neg_overflow;
+            }
+            return (long) val;
+        }
+    } else
+#endif
+    if (likely(PyLong_Check(x))) {
+        if (is_unsigned) {
+#if CYTHON_USE_PYLONG_INTERNALS
+            const digit* digits = ((PyLongObject*)x)->ob_digit;
+            switch (Py_SIZE(x)) {
+                case  0: return (long) 0;
+                case  1: __PYX_VERIFY_RETURN_INT(long, digit, digits[0])
+                case 2:
+                    if (8 * sizeof(long) > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(long) >= 2 * PyLong_SHIFT) {
+                            return (long) (((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
+                        }
+                    }
+                    break;
+                case 3:
+                    if (8 * sizeof(long) > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(long) >= 3 * PyLong_SHIFT) {
+                            return (long) (((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
+                        }
+                    }
+                    break;
+                case 4:
+                    if (8 * sizeof(long) > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(long) >= 4 * PyLong_SHIFT) {
+                            return (long) (((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
+                        }
+                    }
+                    break;
+            }
+#endif
+#if CYTHON_COMPILING_IN_CPYTHON
+            if (unlikely(Py_SIZE(x) < 0)) {
+                goto raise_neg_overflow;
+            }
+#else
+            {
+                int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
+                if (unlikely(result < 0))
+                    return (long) -1;
+                if (unlikely(result == 1))
+                    goto raise_neg_overflow;
+            }
+#endif
+            if (sizeof(long) <= sizeof(unsigned long)) {
+                __PYX_VERIFY_RETURN_INT_EXC(long, unsigned long, PyLong_AsUnsignedLong(x))
+#ifdef HAVE_LONG_LONG
+            } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) {
+                __PYX_VERIFY_RETURN_INT_EXC(long, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
+#endif
+            }
+        } else {
+#if CYTHON_USE_PYLONG_INTERNALS
+            const digit* digits = ((PyLongObject*)x)->ob_digit;
+            switch (Py_SIZE(x)) {
+                case  0: return (long) 0;
+                case -1: __PYX_VERIFY_RETURN_INT(long, sdigit, (sdigit) (-(sdigit)digits[0]))
+                case  1: __PYX_VERIFY_RETURN_INT(long,  digit, +digits[0])
+                case -2:
+                    if (8 * sizeof(long) - 1 > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) {
+                            return (long) (((long)-1)*(((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case 2:
+                    if (8 * sizeof(long) > 1 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) {
+                            return (long) ((((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case -3:
+                    if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) {
+                            return (long) (((long)-1)*(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case 3:
+                    if (8 * sizeof(long) > 2 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) {
+                            return (long) ((((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case -4:
+                    if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) {
+                            return (long) (((long)-1)*(((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+                case 4:
+                    if (8 * sizeof(long) > 3 * PyLong_SHIFT) {
+                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
+                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
+                        } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) {
+                            return (long) ((((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
+                        }
+                    }
+                    break;
+            }
+#endif
+            if (sizeof(long) <= sizeof(long)) {
+                __PYX_VERIFY_RETURN_INT_EXC(long, long, PyLong_AsLong(x))
+#ifdef HAVE_LONG_LONG
+            } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) {
+                __PYX_VERIFY_RETURN_INT_EXC(long, PY_LONG_LONG, PyLong_AsLongLong(x))
+#endif
+            }
+        }
+        {
+#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
+            PyErr_SetString(PyExc_RuntimeError,
+                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
+#else
+            long val;
+            PyObject *v = __Pyx_PyNumber_IntOrLong(x);
+ #if PY_MAJOR_VERSION < 3
+            if (likely(v) && !PyLong_Check(v)) {
+                PyObject *tmp = v;
+                v = PyNumber_Long(tmp);
+                Py_DECREF(tmp);
+            }
+ #endif
+            if (likely(v)) {
+                int one = 1; int is_little = (int)*(unsigned char *)&one;
+                unsigned char *bytes = (unsigned char *)&val;
+                int ret = _PyLong_AsByteArray((PyLongObject *)v,
+                                              bytes, sizeof(val),
+                                              is_little, !is_unsigned);
+                Py_DECREF(v);
+                if (likely(!ret))
+                    return val;
+            }
+#endif
+            return (long) -1;
+        }
+    } else {
+        long val;
+        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
+        if (!tmp) return (long) -1;
+        val = __Pyx_PyInt_As_long(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+raise_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "value too large to convert to long");
+    return (long) -1;
+raise_neg_overflow:
+    PyErr_SetString(PyExc_OverflowError,
+        "can't convert negative value to long");
+    return (long) -1;
+}
+
+/* FastTypeChecks */
+          #if CYTHON_COMPILING_IN_CPYTHON
+static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) {  /* Return 1 if b is found by following a's tp_base chain, or if b is the base object type; otherwise 0. */
+    while (a) {
+        a = a->tp_base;  /* step to the parent first, so a itself is never compared with b */
+        if (a == b)
+            return 1;
+    }
+    return b == &PyBaseObject_Type;  /* chain exhausted without a hit: only `object` still counts as a base */
+}
+static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b) {  /* Return 1 if a is b or b appears in a's tp_mro tuple; uses the tp_base walk when tp_mro is NULL. */
+    PyObject *mro;
+    if (a == b) return 1;
+    mro = a->tp_mro;
+    if (likely(mro)) {
+        Py_ssize_t i, n;
+        n = PyTuple_GET_SIZE(mro);
+        for (i = 0; i < n; i++) {
+            if (PyTuple_GET_ITEM(mro, i) == (PyObject *)b)  /* pointer identity against each MRO entry */
+                return 1;
+        }
+        return 0;
+    }
+    return __Pyx_InBases(a, b);  /* no MRO tuple on this type: fall back to the base-pointer chain */
+}
+#if PY_MAJOR_VERSION == 2
+static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject* exc_type2) {  /* Python 2 variant: return 1 if err is a subclass of exc_type1 (when non-NULL) or of exc_type2, else 0. */
+    PyObject *exception, *value, *tb;
+    int res;
+    __Pyx_PyThreadState_declare
+    __Pyx_PyThreadState_assign
+    __Pyx_ErrFetch(&exception, &value, &tb);  /* set the pending exception aside; it is restored before returning */
+    res = exc_type1 ? PyObject_IsSubclass(err, exc_type1) : 0;
+    if (unlikely(res == -1)) {
+        PyErr_WriteUnraisable(err);  /* the subclass check itself failed: report it and treat it as "no match" */
+        res = 0;
+    }
+    if (!res) {
+        res = PyObject_IsSubclass(err, exc_type2);
+        if (unlikely(res == -1)) {
+            PyErr_WriteUnraisable(err);
+            res = 0;
+        }
+    }
+    __Pyx_ErrRestore(exception, value, tb);
+    return res;
+}
+#else
+static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject *exc_type2) {  /* Variant for Python versions other than 2: same result contract, decided only through __Pyx_IsSubtype. */
+    int res = exc_type1 ? __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type1) : 0;  /* exc_type1 may be NULL, in which case only exc_type2 is tested */
+    if (!res) {
+        res = __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type2);
+    }
+    return res;
+}
+#endif
+static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) {  /* Return 1 if exc_type matches any entry of tuple (by identity or as a subclass), else 0. */
+    Py_ssize_t i, n;
+    assert(PyExceptionClass_Check(exc_type));
+    n = PyTuple_GET_SIZE(tuple);
+#if PY_MAJOR_VERSION >= 3
+    for (i=0; i<n; i++) {  /* first pass: identity comparison only */
+        if (exc_type == PyTuple_GET_ITEM(tuple, i)) return 1;
+    }
+#endif
+    for (i=0; i<n; i++) {
+        PyObject *t = PyTuple_GET_ITEM(tuple, i);
+        #if PY_MAJOR_VERSION < 3
+        if (likely(exc_type == t)) return 1;
+        #endif
+        if (likely(PyExceptionClass_Check(t))) {
+            if (__Pyx_inner_PyErr_GivenExceptionMatches2(exc_type, NULL, t)) return 1;
+        } else {  /* entries that are not exception classes are skipped without any check */
+        }
+    }
+    return 0;
+}
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject* exc_type) {  /* Return non-zero if err matches exc_type; tries identity and class/tuple shortcuts before calling PyErr_GivenExceptionMatches(). */
+    if (likely(err == exc_type)) return 1;
+    if (likely(PyExceptionClass_Check(err))) {
+        if (likely(PyExceptionClass_Check(exc_type))) {
+            return __Pyx_inner_PyErr_GivenExceptionMatches2(err, NULL, exc_type);  /* class vs. class */
+        } else if (likely(PyTuple_Check(exc_type))) {
+            return __Pyx_PyErr_GivenExceptionMatchesTuple(err, exc_type);  /* class vs. tuple of candidates */
+        } else {  /* any other exc_type falls through to the generic C-API call below */
+        }
+    }
+    return PyErr_GivenExceptionMatches(err, exc_type);
+}
+static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *exc_type1, PyObject *exc_type2) {  /* Return non-zero if err matches exc_type1 or exc_type2; both are asserted to be exception classes. */
+    assert(PyExceptionClass_Check(exc_type1));
+    assert(PyExceptionClass_Check(exc_type2));
+    if (likely(err == exc_type1 || err == exc_type2)) return 1;
+    if (likely(PyExceptionClass_Check(err))) {
+        return __Pyx_inner_PyErr_GivenExceptionMatches2(err, exc_type1, exc_type2);
+    }
+    return (PyErr_GivenExceptionMatches(err, exc_type1) || PyErr_GivenExceptionMatches(err, exc_type2));  /* err is not an exception class: use the generic C-API check */
+}
+#endif
+
+/* CheckBinaryVersion */
+          static int __Pyx_check_binary_version(void) {  /* Warn when the running interpreter's major.minor differs from the build's; returns PyErr_WarnEx()'s result, or 0 on a match. */
+    int rt_major = 0, rt_minor = 0;
+    /* Compare parsed integers: a 4-byte "%d.%d" buffer truncates "3.10" to "3.1", so 3.1 and 3.1x would look identical. */
+    (void) sscanf(Py_GetVersion(), "%d.%d", &rt_major, &rt_minor);
+    if (rt_major != PY_MAJOR_VERSION || rt_minor != PY_MINOR_VERSION) {
+        char message[200];
+        PyOS_snprintf(message, sizeof(message),
+                      "compiletime version %d.%d of module '%.100s' "
+                      "does not match runtime version %d.%d",
+                      PY_MAJOR_VERSION, PY_MINOR_VERSION, __Pyx_MODULE_NAME, rt_major, rt_minor);
+        return PyErr_WarnEx(NULL, message, 1);
+    }
+    return 0;
+}
+
+/* InitStrings */
+          static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {  /* Create the Python string object for each table entry until an entry with a NULL p; returns 0, or -1 on the first failure. */
+    while (t->p) {
+        #if PY_MAJOR_VERSION < 3
+        if (t->is_unicode) {
+            *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);  /* NOTE(review): n - 1 presumably drops a counted trailing NUL - confirm */
+        } else if (t->intern) {
+            *t->p = PyString_InternFromString(t->s);
+        } else {
+            *t->p = PyString_FromStringAndSize(t->s, t->n - 1);
+        }
+        #else
+        if (t->is_unicode | t->is_str) {  /* either flag selects a unicode object; everything else becomes bytes */
+            if (t->intern) {
+                *t->p = PyUnicode_InternFromString(t->s);
+            } else if (t->encoding) {
+                *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);
+            } else {
+                *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);
+            }
+        } else {
+            *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);
+        }
+        #endif
+        if (!*t->p)
+            return -1;
+        if (PyObject_Hash(*t->p) == -1)  /* hash each new object right away; a hashing failure aborts initialisation */
+            return -1;
+        ++t;
+    }
+    return 0;
+}
+
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_str) {  /* Build a unicode object from a NUL-terminated C string; the length is taken with strlen(). */
+    return __Pyx_PyUnicode_FromStringAndSize(c_str, (Py_ssize_t)strlen(c_str));
+}
+static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) {  /* Same as __Pyx_PyObject_AsStringAndSize(), but the length is discarded. */
+    Py_ssize_t ignore;
+    return __Pyx_PyObject_AsStringAndSize(o, &ignore);
+}
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+#if !CYTHON_PEP393_ENABLED
+static const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) {  /* Non-PEP393 variant: return the default-encoded bytes of o and store their size in *length; NULL on failure. */
+    char* defenc_c;
+    PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL);
+    if (!defenc) return NULL;
+    defenc_c = PyBytes_AS_STRING(defenc);
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+    {
+        char* end = defenc_c + PyBytes_GET_SIZE(defenc);
+        char* c;
+        for (c = defenc_c; c < end; c++) {
+            if ((unsigned char) (*c) >= 128) {  /* a byte outside 7-bit ASCII makes the conversion fail */
+                PyUnicode_AsASCIIString(o);  /* result is discarded; presumably called only to set the encoding exception */
+                return NULL;
+            }
+        }
+    }
+#endif
+    *length = PyBytes_GET_SIZE(defenc);
+    return defenc_c;
+}
+#else
+static CYTHON_INLINE const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) {  /* PEP393 variant: return a char buffer for o and store its size in *length; NULL on failure. */
+    if (unlikely(__Pyx_PyUnicode_READY(o) == -1)) return NULL;
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+    if (likely(PyUnicode_IS_ASCII(o))) {
+        *length = PyUnicode_GET_LENGTH(o);  /* for an ASCII-only string the character count is used as the byte length */
+        return PyUnicode_AsUTF8(o);
+    } else {
+        PyUnicode_AsASCIIString(o);  /* result is discarded; presumably called only to set the encoding exception */
+        return NULL;
+    }
+#else
+    return PyUnicode_AsUTF8AndSize(o, length);
+#endif
+}
+#endif
+#endif
+static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) {  /* Return a char buffer for a unicode (when enabled), bytearray or bytes object and store its size in *length; NULL on failure. */
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+    if (
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+            __Pyx_sys_getdefaultencoding_not_ascii &&
+#endif
+            PyUnicode_Check(o)) {
+        return __Pyx_PyUnicode_AsStringAndSize(o, length);
+    } else
+#endif
+#if (!CYTHON_COMPILING_IN_PYPY) || (defined(PyByteArray_AS_STRING) && defined(PyByteArray_GET_SIZE))
+    if (PyByteArray_Check(o)) {
+        *length = PyByteArray_GET_SIZE(o);
+        return PyByteArray_AS_STRING(o);
+    } else
+#endif
+    {  /* everything else goes through PyBytes_AsStringAndSize(), which reports the error for unsupported objects */
+        char* result;
+        int r = PyBytes_AsStringAndSize(o, &result, length);
+        if (unlikely(r < 0)) {
+            return NULL;
+        } else {
+            return result;
+        }
+    }
+}
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {  /* Truth value of x; True, False and None are answered by pointer comparison, everything else by PyObject_IsTrue(). */
+   int is_true = x == Py_True;
+   if (is_true | (x == Py_False) | (x == Py_None)) return is_true;  /* one of the three singletons: is_true already holds the answer */
+   else return PyObject_IsTrue(x);
+}
+static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* result, const char* type_name) {  /* Handle a conversion result of unexpected type: returns result after a warning, or releases it and returns NULL with an exception set. */
+#if PY_MAJOR_VERSION >= 3
+    if (PyLong_Check(result)) {  /* still an int instance (e.g. a subclass): only a DeprecationWarning is issued */
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                "__int__ returned non-int (type %.200s).  "
+                "The ability to return an instance of a strict subclass of int "
+                "is deprecated, and may be removed in a future version of Python.",
+                Py_TYPE(result)->tp_name)) {
+            Py_DECREF(result);  /* the warning was turned into an exception: drop the result */
+            return NULL;
+        }
+        return result;
+    }
+#endif
+    PyErr_Format(PyExc_TypeError,
+                 "__%.4s__ returned non-%.4s (type %.200s)",
+                 type_name, type_name, Py_TYPE(result)->tp_name);
+    Py_DECREF(result);
+    return NULL;
+}
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x) {  /* Return x converted to an int/long object (x itself when it already is one), or NULL with an exception set. */
+#if CYTHON_USE_TYPE_SLOTS
+  PyNumberMethods *m;
+#endif
+  const char *name = NULL;
+  PyObject *res = NULL;
+#if PY_MAJOR_VERSION < 3
+  if (likely(PyInt_Check(x) || PyLong_Check(x)))
+#else
+  if (likely(PyLong_Check(x)))
+#endif
+    return __Pyx_NewRef(x);
+#if CYTHON_USE_TYPE_SLOTS
+  m = Py_TYPE(x)->tp_as_number;  /* call the type's nb_int / nb_long slot directly */
+  #if PY_MAJOR_VERSION < 3
+  if (m && m->nb_int) {
+    name = "int";
+    res = m->nb_int(x);
+  }
+  else if (m && m->nb_long) {
+    name = "long";
+    res = m->nb_long(x);
+  }
+  #else
+  if (likely(m && m->nb_int)) {
+    name = "int";
+    res = m->nb_int(x);
+  }
+  #endif
+#else
+  if (!PyBytes_CheckExact(x) && !PyUnicode_CheckExact(x)) {  /* exact bytes/unicode objects are never passed to PyNumber_Int(); res stays NULL for them */
+    res = PyNumber_Int(x);
+  }
+#endif
+  if (likely(res)) {
+#if PY_MAJOR_VERSION < 3
+    if (unlikely(!PyInt_Check(res) && !PyLong_Check(res))) {
+#else
+    if (unlikely(!PyLong_CheckExact(res))) {
+#endif
+        return __Pyx_PyNumber_IntOrLongWrongResultType(res, name);
+    }
+  }
+  else if (!PyErr_Occurred()) {  /* no conversion was attempted or it returned NULL silently: raise the generic TypeError */
+    PyErr_SetString(PyExc_TypeError,
+                    "an integer is required");
+  }
+  return res;
+}
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {  /* Convert an index-like object to Py_ssize_t; on failure the result comes from the failing C-API call (-1 with an exception set). */
+  Py_ssize_t ival;
+  PyObject *x;
+#if PY_MAJOR_VERSION < 3
+  if (likely(PyInt_CheckExact(b))) {
+    if (sizeof(Py_ssize_t) >= sizeof(long))
+        return PyInt_AS_LONG(b);
+    else
+        return PyInt_AsSsize_t(b);  /* convert the argument b; the local x is not assigned until PyNumber_Index() below */
+  }
+#endif
+  if (likely(PyLong_CheckExact(b))) {
+    #if CYTHON_USE_PYLONG_INTERNALS
+    const digit* digits = ((PyLongObject*)b)->ob_digit;
+    const Py_ssize_t size = Py_SIZE(b);
+    if (likely(__Pyx_sst_abs(size) <= 1)) {  /* zero or a single digit: read it directly */
+        ival = likely(size) ? digits[0] : 0;
+        if (size == -1) ival = -ival;
+        return ival;
+    } else {
+      switch (size) {  /* 2 to 4 digits: assemble inline only when Py_ssize_t is wide enough, otherwise fall through */
+         case 2:
+           if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) {
+             return (Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case -2:
+           if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) {
+             return -(Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case 3:
+           if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) {
+             return (Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case -3:
+           if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) {
+             return -(Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case 4:
+           if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) {
+             return (Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+         case -4:
+           if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) {
+             return -(Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
+           }
+           break;
+      }
+    }
+    #endif
+    return PyLong_AsSsize_t(b);  /* generic path for exact ints not handled above */
+  }
+  x = PyNumber_Index(b);  /* any other object: go through its __index__ */
+  if (!x) return -1;
+  ival = PyInt_AsSsize_t(x);
+  Py_DECREF(x);
+  return ival;
+}
+static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) {  /* Return Py_True for non-zero b and Py_False for zero, each passed through __Pyx_NewRef(). */
+  return b ? __Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False);
+}
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {  /* Thin wrapper that forwards ival to PyInt_FromSize_t(). */
+    return PyInt_FromSize_t(ival);
+}
+
+
+#endif /* Py_PYTHON_H */
diff --git a/python/bfs/bfs_wrapper.pyx b/python/bfs/bfs_wrapper.pyx
new file mode 100644
index 00000000000..f0e23b6bc96
--- /dev/null
+++ b/python/bfs/bfs_wrapper.pyx
@@ -0,0 +1,193 @@
+from c_bfs cimport *
+from libcpp cimport bool
+from libc.stdint cimport uintptr_t
+from libc.stdlib cimport calloc, malloc, free
+import cudf
+from librmm_cffi import librmm as rmm
+#from pygdf import Column
+import numpy as np
+
+dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}  # numpy scalar type -> gdf_dtype enum value; only these four types are mapped
+
+def _get_ctype_ptr(obj):
+    # Returns obj.device_ctypes_pointer.value (presumably the device address as an integer).
+    # The way pointers are reached inside a gdf might change, so access is encapsulated in
+    # this helper and the two that follow; they might also become part of future gdf versions.
+    return obj.device_ctypes_pointer.value
+
+def _get_column_data_ptr(obj):  # address of the GPU array behind obj's data buffer (obj._column._data)
+    return _get_ctype_ptr(obj._column._data.to_gpu_array())
+
+def _get_column_valid_ptr(obj):  # address of the GPU array behind obj's mask buffer (obj._column._mask)
+    return _get_ctype_ptr(obj._column._mask.to_gpu_array())
+
+#def _get_gdf_as_matrix_ptr(gdf):
+#    return self._get_ctype_ptr(gdf.as_gpu_matrix())
+
+cdef create_column(col):
+    # Build a malloc'ed gdf_column that views col's data buffer and return its address as an integer.
+    # Nothing here frees the struct, so ownership passes to the caller; no validity mask is attached.
+    cdef uintptr_t data_ptr = _get_column_data_ptr(col)
+    #cdef uintptr_t valid_ptr = _get_column_valid_ptr(col)
+    cdef gdf_column* c_col = <gdf_column*>malloc(sizeof(gdf_column))
+    if c_col == NULL: raise MemoryError()
+    gdf_column_view_augmented(<gdf_column*>c_col,
+                              <void*> data_ptr,
+                              <gdf_valid_type*> 0,
+                              <gdf_size_type>len(col),
+                              dtypes[col.dtype.type],
+                              <gdf_size_type>col.null_count)
+    
+    cdef uintptr_t col_ptr = <uintptr_t>c_col
+    return col_ptr
+
+class Graph:
+    """
+        cuGraph graph class containing basic graph creation and transformation operations.
+    """
+    def __init__(self):
+        """
+        Returns
+        -------
+        Graph : cuGraph.Graph
+
+        Examples
+        --------
+        >>> import cuGraph
+        >>> G = cuGraph.Graph()
+        """
+        cdef gdf_graph* graph
+        graph = <gdf_graph*>calloc(1,sizeof(gdf_graph))
+        if graph == NULL: raise MemoryError()
+        cdef uintptr_t graph_ptr = <uintptr_t>graph
+        self.graph_ptr = graph_ptr
+
+
+    def add_edge_list(self, source_col, dest_col, value_col=None):
+        """
+        Wrap existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory.
+        The cuGraph graph should not already contain the connectivity information as an edge list.
+        If successful, the cuGraph graph descriptor contains the newly added edge list (value_col is optional).
+
+        Parameters
+        ----------
+        source_col : cudf.Series
+            This column of size E (number of edges) contains the index of the source for each edge.
+            Indices must be in the range [0, V-1].
+        dest_col : cudf.Series
+            This column of size E (number of edges) contains the index of the destination for each edge.
+            Indices must be in the range [0, V-1].
+        value_col (optional) : cudf.Series
+            This argument can be ``None``. If not, this column of size E (number of edges) contains the weight for each edge.
+            The type is expected to be floating point.
+
+        Examples
+        --------
+        >>> import cuGraph
+        >>> import cudf
+        >>> from scipy.io import mmread
+        >>> M = ReadMtxFile(graph_file)
+        >>> sources = cudf.Series(M.row)
+        >>> destinations = cudf.Series(M.col)
+        >>> G = cuGraph.Graph()
+        >>> G.add_edge_list(sources,destinations,None)
+
+        """
+
+        cdef uintptr_t graph = self.graph_ptr
+        cdef uintptr_t source=create_column(source_col)
+        cdef uintptr_t dest=create_column(dest_col)
+        cdef uintptr_t value
+        if value_col is None:
+            value = 0
+        else:
+            value=create_column(value_col)
+
+        gdf_edge_list_view(<gdf_graph*>graph,
+                       <gdf_column*>source,
+                       <gdf_column*>dest,
+                       <gdf_column*>value)
+    
+    def view_edge_list(self):
+        ##TO DO
+        """
+        Display the edge list (for now this only prints the number of edges and returns 0).
+        """
+        cdef uintptr_t graph = self.graph_ptr
+        cdef gdf_graph* g = <gdf_graph*>graph
+        if g.edgeList == NULL: raise ValueError("graph has no edge list; call add_edge_list first")
+        size = g.edgeList.src_indices.size
+        print(size)
+        # TODO: build a cudf column from src_indices. A raw gdf_column* is not a Python object, so it must not be cast to <object>.
+        #return pygdf.Series(data)        
+        return 0
+
+    def add_adj_list(self, offsets_col, indices_col, value_col):
+        """
+        Wrap existing gdf columns representing an adjacency list in a gdf_graph; value_col may be None.
+        """
+        ##TO TEST
+        cdef uintptr_t graph = self.graph_ptr
+        cdef uintptr_t offsets=create_column(offsets_col)
+        cdef uintptr_t indices=create_column(indices_col)
+        cdef uintptr_t value
+        if value_col is None:
+            value = 0
+        else:
+            value=create_column(value_col)
+    
+        gdf_adj_list_view(<gdf_graph*>graph,
+                       <gdf_column*>offsets,
+                       <gdf_column*>indices,
+                       <gdf_column*>value)
+
+
+    def add_transpose(self):
+        """
+        Compute the transposed adjacency list from the edge list and add it to the existing graph.
+        """
+        cdef uintptr_t graph = self.graph_ptr
+        gdf_add_transpose(<gdf_graph*>graph)
+
+cpdef bfs(G, start, directed=True):
+    """
+    Find the distances and predecessors for a breadth first traversal of a graph.
+    
+    Parameters
+    ----------
+    G : cugraph.graph
+        cuGraph graph descriptor, must contain the connectivity information as an
+        adjacency list (a ValueError is raised otherwise).
+    start : Integer
+        The index of the graph vertex from which the traversal begins
+    directed : bool
+        Indicates whether the graph in question is a directed graph, or whether
+        each edge has a corresponding reverse edge. (Allows optimizations if the
+        graph is undirected)
+    
+    Returns
+    -------
+    distances, predecessors : cudf.Series
+        distances gives the path distance for each vertex from the starting vertex
+        predecessors gives for each vertex the vertex it was reached from in the traversal
+        
+    Examples
+    --------
+    >>> M = ReadMtxFile(graph_file).tocsr()
+    >>> offsets = cudf.Series(M.indptr)
+    >>> indices = cudf.Series(M.indices)
+    >>> G = cuGraph.Graph()
+    >>> G.add_adj_list(offsets,indices,None)
+    >>> dist, pred = cuGraph.bfs(G, 0, False)
+    """
+    cdef uintptr_t graph = G.graph_ptr
+    cdef gdf_graph* g = <gdf_graph*>graph
+    if g.adjList == NULL: raise ValueError("graph has no adjacency list; call add_adj_list first")
+    num_verts = g.adjList.offsets.size - 1
+    distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+    cdef uintptr_t distances_ptr = create_column(distances)
+    predecessors = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+    cdef uintptr_t predecessors_ptr = create_column(predecessors)  # each output Series needs its own column view
+    
+    gdf_bfs(<gdf_graph*>g, <gdf_column*>distances_ptr, <gdf_column*>predecessors_ptr, <int>start, <bool>directed)
+    return distances, predecessors
\ No newline at end of file
diff --git a/python/bfs/c_bfs.pxd b/python/bfs/c_bfs.pxd
new file mode 100644
index 00000000000..ec4e8f8808e
--- /dev/null
+++ b/python/bfs/c_bfs.pxd
@@ -0,0 +1,75 @@
+
+from libcpp cimport bool
+
+cdef extern from "cudf.h":  # declarations Cython takes from cudf.h; only the names listed here are visible to the wrapper
+
+    ctypedef enum gdf_error:  # status type returned by the gdf_* calls; no enumerators are declared here
+        pass
+
+    ctypedef enum gdf_dtype:  # element type tags; bfs_wrapper.pyx's dtypes table maps numpy types onto four of them
+        GDF_invalid=0,
+        GDF_INT8,
+        GDF_INT16,
+        GDF_INT32,
+        GDF_INT64,
+        GDF_FLOAT32,
+        GDF_FLOAT64,
+        GDF_DATE32,     
+        GDF_DATE64,     
+        GDF_TIMESTAMP,  
+        GDF_CATEGORY,
+        GDF_STRING,
+        N_GDF_TYPES
+
+    ctypedef unsigned char gdf_valid_type
+    ctypedef size_t gdf_size_type  # NOTE(review): confirm this matches the typedef in the installed cudf.h
+ 
+    struct gdf_column_:  # only the fields the wrapper needs are declared
+        void *data                       
+        gdf_valid_type *valid
+        gdf_size_type size             
+        gdf_dtype dtype
+
+
+    ctypedef gdf_column_ gdf_column
+
+    cdef gdf_error gdf_column_view_augmented(gdf_column *column,  # used by create_column() to fill a gdf_column around an existing buffer
+                              void *data, 
+                              gdf_valid_type *valid,
+                              gdf_size_type size, 
+                              gdf_dtype dtype,
+                              gdf_size_type null_count)
+
+    cdef gdf_error gdf_column_view_new(gdf_column *column,
+                              void *data)
+
+cdef extern from "cugraph.h":  # graph structures and entry points Cython takes from cugraph.h
+
+    struct gdf_edge_list:  # edge list representation: one column per edge attribute
+        gdf_column *src_indices
+        gdf_column *dest_indices
+        gdf_column *edge_data
+
+    struct gdf_adj_list:  # adjacency list representation: offsets and indices columns plus optional edge data
+        gdf_column *offsets
+        gdf_column *indices
+        gdf_column *edge_data
+
+    struct gdf_graph:  # holds pointers to the representations; the wrapper zero-initialises it with calloc, so all start NULL
+        gdf_edge_list *edgeList
+        gdf_adj_list *adjList
+        gdf_adj_list *transposedAdjList
+        
+    cdef gdf_error gdf_edge_list_view(gdf_graph *graph, 
+                             const gdf_column *source_indices,
+                             const gdf_column *destination_indices,
+                             const gdf_column *edge_data)
+    
+    cdef gdf_error gdf_adj_list_view (gdf_graph *graph, 
+                             const gdf_column *offsets,
+                             const gdf_column *indices,
+                             const gdf_column *edge_data)
+
+    cdef gdf_error gdf_add_transpose(gdf_graph *graph)
+
+    cdef gdf_error gdf_bfs(gdf_graph *graph, gdf_column *distances, gdf_column *predecessors, int start_node, bool directed)  # distances and predecessors are the output columns passed by bfs()
diff --git a/setup.py b/setup.py
index af52dedb995..543937c74da 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@
 # temporary fix. cudf 0.5 will have a cudf.get_include()
 cudf_include = os.path.normpath(sys.prefix) + '/include'
 
-cython_files = ['python/pagerank/pagerank_wrapper.pyx']
+cython_files = ['python/pagerank/pagerank_wrapper.pyx', 'python/bfs/bfs_wrapper.pyx']
 
 extensions = [
     Extension("cugraph",
diff --git a/src/bfs.cu b/src/bfs.cu
new file mode 100644
index 00000000000..aa5490460fb
--- /dev/null
+++ b/src/bfs.cu
@@ -0,0 +1,470 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ */
+
+#include <algorithm>
+#include <iomanip>
+#include "bfs.cuh"
+#include <limits>
+
+#include "graph_utils.cuh"
+#include "bfs_kernels.cuh"
+
+using namespace bfs_kernels;
+
+namespace cugraph {
+	enum BFS_ALGO_STATE { //Direction used by the current iteration of Bfs::traverse
+		TOPDOWN, BOTTOMUP
+	};
+
+	template<typename IndexType>
+	void Bfs<IndexType>::setup() {
+		//Allocates the device working buffers, then runs flag_isolated_vertices once and waits for nisolated
+		// Determinism flag, false by default
+		deterministic = false;
+		//Working data
+		//Each vertex can be in the frontier at most once
+		cudaMalloc(&frontier, n * sizeof(IndexType));
+
+		//We will update frontier during the execution
+		//We need the original pointer to reset frontier, and to cudaFree it
+		original_frontier = frontier;
+
+		//size (in ints) of the bitmaps holding one bit per vertex
+		vertices_bmap_size = (n / (8 * sizeof(int)) + 1);
+		//ith bit of visited_bmap is set <=> ith vertex is visited
+		cudaMalloc(&visited_bmap, sizeof(int) * vertices_bmap_size);
+
+		//ith bit of isolated_bmap is set <=> degree of ith vertex = 0
+		cudaMalloc(&isolated_bmap, sizeof(int) * vertices_bmap_size);
+
+		//vertex_degree[i] = degree of vertex i
+		cudaMalloc(&vertex_degree, sizeof(IndexType) * n);
+
+		//Cub working data
+		cub_exclusive_sum_alloc(n + 1, d_cub_exclusive_sum_storage, cub_exclusive_sum_storage_bytes);
+
+		//We will need two (n+1) ints buffers for two different things (bottom up or top down) - sharing them since those uses are mutually exclusive
+		cudaMalloc(&buffer_np1_1, (n + 1) * sizeof(IndexType));
+		cudaMalloc(&buffer_np1_2, (n + 1) * sizeof(IndexType));
+
+		//Using buffers : top down
+
+		//frontier_vertex_degree[i] is the degree of vertex frontier[i]
+		frontier_vertex_degree = buffer_np1_1;
+		//exclusive sum of frontier_vertex_degree
+		exclusive_sum_frontier_vertex_degree = buffer_np1_2;
+
+		//Using buffers : bottom up
+		//contains list of unvisited vertices
+		unvisited_queue = buffer_np1_1;
+		//size of the "last" unvisited queue : size_last_unvisited_queue
+		//refers to the size of unvisited_queue
+		//which may not be up to date (the queue may contain vertices that are now visited)
+
+		//We may leave vertices unvisited after bottom up main kernels - storing them here
+		left_unvisited_queue = buffer_np1_2;
+
+		//We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). frontier_vertex_degree_buckets_offsets[i] is the index k such that frontier[k] is the source of the first edge of the bucket
+		//See top down kernels for more details
+		cudaMalloc(&exclusive_sum_frontier_vertex_buckets_offsets,
+						((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType));
+
+		//Init device-side counters
+		//Those counters must be/can be reset at each bfs iteration
+		//Keeping them adjacent in memory allows us to call only one cudaMemset - launch latency is the current bottleneck
+		cudaMalloc(&d_counters_pad, 4 * sizeof(IndexType));
+
+		d_new_frontier_cnt = &d_counters_pad[0];
+		d_mu = &d_counters_pad[1];
+		d_unvisited_cnt = &d_counters_pad[2];
+		d_left_unvisited_cnt = &d_counters_pad[3];
+
+		//Let's borrow this counter as the isolated vertices counter for the next 3 calls
+		//Its dereferenced value is not initialized yet - so we don't care about what we put in it
+		IndexType * d_nisolated = d_new_frontier_cnt;
+		cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream);
+
+		//Computing isolated_bmap
+		//Only dependent on graph - not source vertex - done once
+		flag_isolated_vertices(n, isolated_bmap, row_offsets, vertex_degree, d_nisolated, stream);
+		cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream);
+
+		//We need nisolated to be ready to use
+		cudaStreamSynchronize(stream);
+	}
+
+	template<typename IndexType>
+	void Bfs<IndexType>::configure(IndexType *_distances,
+												IndexType *_predecessors,
+												int *_edge_mask)
+												{
+		distances = _distances;
+		predecessors = _predecessors;
+		edge_mask = _edge_mask;
+		//A NULL pointer disables the corresponding output / edge mask
+		useEdgeMask = (edge_mask != NULL);
+		computeDistances = (distances != NULL);
+		computePredecessors = (predecessors != NULL);
+
+		//Scratch distances when the caller gave none. NOTE(review): traverse() only enables bottom up when !directed, yet this allocates when directed - confirm the condition (clean() frees under the same test)
+		if (directed && !computeDistances)
+			cudaMalloc(&distances, n * sizeof(IndexType));
+	}
+
+	template<typename IndexType>
+	void Bfs<IndexType>::traverse(IndexType source_vertex) {
+		//Runs a BFS from source_vertex, writing to distances/predecessors when they were given to configure()
+		//Init visited_bmap
+		//If the graph is undirected, we know that
+		//we will never discover isolated vertices (in degree = out degree = 0)
+		//we avoid a lot of work by flagging them now
+		//in g500 graphs they represent ~25% of total vertices
+		//more than that for wiki and twitter graphs
+
+		if (directed) {
+			cudaMemsetAsync(visited_bmap, 0, vertices_bmap_size * sizeof(int), stream);
+		}
+		else {
+			cudaMemcpyAsync(visited_bmap,
+									isolated_bmap,
+									vertices_bmap_size * sizeof(int),
+									cudaMemcpyDeviceToDevice,
+									stream);
+		}
+
+		//If needed, setting all vertices as undiscovered (inf distance)
+		//We dont use computeDistances here
+		//if the graph is undirected, we may need distances even if
+		//computeDistances is false
+		if (distances)
+			fill_vec(distances, n, vec_t<IndexType>::max, stream);
+
+		//If needed, setting all predecessors to non-existent (-1)
+		if (computePredecessors) {
+			cudaMemsetAsync(predecessors, -1, n * sizeof(IndexType), stream);
+		}
+
+		//
+		//Initial frontier
+		//
+
+		frontier = original_frontier;
+
+		if (distances) {
+			cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream);
+		}
+
+		//Setting source_vertex as visited
+		//There may be bit already set on that bmap (isolated vertices) - if the graph is undirected
+		int current_visited_bmap_source_vert = 0;
+
+		if (!directed) {
+			cudaMemcpyAsync(&current_visited_bmap_source_vert,
+									&visited_bmap[source_vertex / INT_SIZE],
+									sizeof(int),
+									cudaMemcpyDeviceToHost, stream); //must be issued on stream : ordered after the bitmap init above and covered by the sync below
+			//We need current_visited_bmap_source_vert
+			cudaStreamSynchronize(stream);
+		}
+
+		int m = (1 << (source_vertex % INT_SIZE));
+
+		//In that case, source is isolated, done now
+		if (!directed && (m & current_visited_bmap_source_vert)) {
+			//Init distances and predecessors are done, (cf Streamsync in previous if)
+			return;
+		}
+
+		m |= current_visited_bmap_source_vert;
+
+		cudaMemcpyAsync(&visited_bmap[source_vertex / INT_SIZE],
+								&m,
+								sizeof(int),
+								cudaMemcpyHostToDevice,
+								stream);
+
+		//Adding source_vertex to init frontier
+		cudaMemcpyAsync(&frontier[0],
+								&source_vertex,
+								sizeof(IndexType),
+								cudaMemcpyHostToDevice,
+								stream);
+
+		//mf : edges in frontier
+		//nf : vertices in frontier
+		//mu : edges undiscovered
+		//nu : nodes undiscovered
+		//lvl : current frontier's depth
+		IndexType mf, nf, mu, nu;
+		bool growing;
+		IndexType lvl = 1;
+
+		//Frontier has one vertex
+		nf = 1;
+
+		//all edges are undiscovered (by def isolated vertices have 0 edges)
+		mu = nnz;
+
+		//all non isolated vertices are undiscovered (excepted source vertex, which is in frontier)
+		//That number is wrong if source_vertex is also isolated - but it's not important
+		nu = n - nisolated - nf;
+
+		//Last frontier was 0, now it is 1
+		growing = true;
+
+		IndexType size_last_left_unvisited_queue = n; //we just need value > 0
+		IndexType size_last_unvisited_queue = 0; //queue empty
+
+		//Typical pre-top down workflow. set_frontier_degree + exclusive-scan
+		set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream);
+		exclusive_sum(d_cub_exclusive_sum_storage,
+							cub_exclusive_sum_storage_bytes,
+							frontier_vertex_degree,
+							exclusive_sum_frontier_vertex_degree,
+							nf + 1,
+							stream);
+
+		cudaMemcpyAsync(&mf,
+								&exclusive_sum_frontier_vertex_degree[nf],
+								sizeof(IndexType),
+								cudaMemcpyDeviceToHost,
+								stream);
+
+		//We need mf
+		cudaStreamSynchronize(stream);
+
+		//At first we know we have to use top down
+		BFS_ALGO_STATE algo_state = TOPDOWN;
+
+		//useDistances : we check if a vertex is a parent using distances in bottom up - distances become working data
+		//undirected g : need parents to be in children's neighbors
+		bool can_use_bottom_up = !directed && distances;
+
+		while (nf > 0) {
+			//Each vertex can appear only once in the frontier array - we know it will fit
+			new_frontier = frontier + nf;
+			IndexType old_nf = nf;
+			resetDevicePointers();
+
+			if (can_use_bottom_up) {
+				//Choosing algo
+				//Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf
+
+				switch (algo_state) {
+					case TOPDOWN:
+						if (mf > mu / alpha)
+							algo_state = BOTTOMUP;
+						break;
+					case BOTTOMUP:
+						if (!growing && nf < n / beta) {
+
+							//We need to prepare the switch back to top down
+							//We couldnt keep track of mu during bottom up - because we dont know what mf is. Computing mu here
+							count_unvisited_edges(unvisited_queue,
+															size_last_unvisited_queue,
+															visited_bmap,
+															vertex_degree,
+															d_mu,
+															stream);
+
+							//Typical pre-top down workflow. set_frontier_degree + exclusive-scan
+							set_frontier_degree(frontier_vertex_degree,
+														frontier,
+														vertex_degree,
+														nf,
+														stream);
+							exclusive_sum(d_cub_exclusive_sum_storage,
+												cub_exclusive_sum_storage_bytes,
+												frontier_vertex_degree,
+												exclusive_sum_frontier_vertex_degree,
+												nf + 1,
+												stream);
+
+							cudaMemcpyAsync(&mf,
+													&exclusive_sum_frontier_vertex_degree[nf],
+													sizeof(IndexType),
+													cudaMemcpyDeviceToHost,
+													stream);
+
+							cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream);
+
+							//We will need mf and mu
+							cudaStreamSynchronize(stream);
+							algo_state = TOPDOWN;
+						}
+						break;
+				}
+			}
+
+			//Executing algo
+
+			switch (algo_state) {
+				case TOPDOWN:
+					compute_bucket_offsets(exclusive_sum_frontier_vertex_degree,
+													exclusive_sum_frontier_vertex_buckets_offsets,
+													nf,
+													mf,
+													stream);
+					frontier_expand(row_offsets,
+											col_indices,
+											frontier,
+											nf,
+											mf,
+											lvl,
+											new_frontier,
+											d_new_frontier_cnt,
+											exclusive_sum_frontier_vertex_degree,
+											exclusive_sum_frontier_vertex_buckets_offsets,
+											visited_bmap,
+											distances,
+											predecessors,
+											edge_mask,
+											isolated_bmap,
+											directed,
+											stream,
+											deterministic);
+
+					mu -= mf;
+
+					cudaMemcpyAsync(&nf,
+											d_new_frontier_cnt,
+											sizeof(IndexType),
+											cudaMemcpyDeviceToHost,
+											stream);
+					cudaCheckError();
+
+					//We need nf
+					cudaStreamSynchronize(stream);
+
+					if (nf) {
+						//Typical pre-top down workflow. set_frontier_degree + exclusive-scan
+						set_frontier_degree(frontier_vertex_degree,
+													new_frontier,
+													vertex_degree,
+													nf,
+													stream);
+						exclusive_sum(d_cub_exclusive_sum_storage,
+											cub_exclusive_sum_storage_bytes,
+											frontier_vertex_degree,
+											exclusive_sum_frontier_vertex_degree,
+											nf + 1,
+											stream);
+						cudaMemcpyAsync(&mf,
+												&exclusive_sum_frontier_vertex_degree[nf],
+												sizeof(IndexType),
+												cudaMemcpyDeviceToHost,
+												stream);
+
+						//We need mf
+						cudaStreamSynchronize(stream);
+					}
+					break;
+
+				case BOTTOMUP:
+					fill_unvisited_queue(visited_bmap,
+												vertices_bmap_size,
+												n,
+												unvisited_queue,
+												d_unvisited_cnt,
+												stream,
+												deterministic);
+
+					size_last_unvisited_queue = nu;
+
+					bottom_up_main(unvisited_queue,
+										size_last_unvisited_queue,
+										left_unvisited_queue,
+										d_left_unvisited_cnt,
+										visited_bmap,
+										row_offsets,
+										col_indices,
+										lvl,
+										new_frontier,
+										d_new_frontier_cnt,
+										distances,
+										predecessors,
+										edge_mask,
+										stream,
+										deterministic);
+
+					//The number of vertices left unvisited decreases
+					//If it wasnt necessary last time, it wont be this time
+					if (size_last_left_unvisited_queue) {
+						cudaMemcpyAsync(&size_last_left_unvisited_queue,
+												d_left_unvisited_cnt,
+												sizeof(IndexType),
+												cudaMemcpyDeviceToHost,
+												stream);
+						cudaCheckError()
+						//We need last_left_unvisited_size
+						cudaStreamSynchronize(stream);
+						bottom_up_large(left_unvisited_queue,
+												size_last_left_unvisited_queue,
+												visited_bmap,
+												row_offsets,
+												col_indices,
+												lvl,
+												new_frontier,
+												d_new_frontier_cnt,
+												distances,
+												predecessors,
+												edge_mask,
+												stream,
+												deterministic);
+					}
+					cudaMemcpyAsync(&nf,
+											d_new_frontier_cnt,
+											sizeof(IndexType),
+											cudaMemcpyDeviceToHost,
+											stream);
+					cudaCheckError()
+
+					//We will need nf
+					cudaStreamSynchronize(stream);
+					break;
+			}
+
+			//Updating undiscovered vertices count
+			nu -= nf;
+
+			//Using new frontier
+			frontier = new_frontier;
+			growing = (nf > old_nf);
+
+			++lvl;
+		}
+	}
+
+	template<typename IndexType>
+	void Bfs<IndexType>::resetDevicePointers() {
+		cudaMemsetAsync(d_counters_pad, 0, 4 * sizeof(IndexType), stream); //zeroes the 4 adjacent device counters with a single call
+	}
+
+	template<typename IndexType>
+	void Bfs<IndexType>::clean() {
+		//Releases the device buffers allocated in setup() - raw pointers, so each one is freed explicitly
+		cudaFree(original_frontier);
+		cudaFree(visited_bmap);
+		cudaFree(isolated_bmap);
+		cudaFree(vertex_degree);
+		cudaFree(d_cub_exclusive_sum_storage);
+		cudaFree(buffer_np1_1);
+		cudaFree(buffer_np1_2);
+		cudaFree(exclusive_sum_frontier_vertex_buckets_offsets);
+		cudaFree(d_counters_pad);
+
+		//In that case, distances is a working buffer allocated by configure(), not a user-provided one
+		if (directed && !computeDistances)
+			cudaFree(distances);
+	}
+
+	template class Bfs<int> ;
+} // end namespace cugraph
diff --git a/src/bfs.cuh b/src/bfs.cuh
new file mode 100755
index 00000000000..c665aabb6e3
--- /dev/null
+++ b/src/bfs.cuh
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ */
+
+#pragma once
+
+#include <climits> 
+
+//Used in nvgraph.h
+
+#define TRAVERSAL_DEFAULT_ALPHA 15
+
+#define TRAVERSAL_DEFAULT_BETA 18
+
+namespace cugraph {
+	template<typename IndexType>
+	class Bfs { //BFS over a graph given by row_offsets/col_indices, switching between top-down and bottom-up steps (alpha, beta)
+	private:
+		IndexType n, nnz;
+		IndexType* row_offsets;
+		IndexType* col_indices;
+
+		bool directed;
+		bool deterministic;
+
+		// edgemask, distances, predecessors are provided by the user through configure() (NULL = not requested)
+		bool useEdgeMask;
+		bool computeDistances;
+		bool computePredecessors;
+		IndexType *distances;
+		IndexType *predecessors;
+		int *edge_mask;
+
+		//Working data
+		//For complete description of each, go to bfs.cu
+		IndexType nisolated;
+		IndexType *frontier, *new_frontier;
+		IndexType * original_frontier;
+		IndexType vertices_bmap_size;
+		int *visited_bmap, *isolated_bmap;
+		IndexType *vertex_degree;
+		IndexType *buffer_np1_1, *buffer_np1_2;
+		IndexType *frontier_vertex_degree;
+		IndexType *exclusive_sum_frontier_vertex_degree;
+		IndexType *unvisited_queue;
+		IndexType *left_unvisited_queue;
+		IndexType *exclusive_sum_frontier_vertex_buckets_offsets;
+		IndexType *d_counters_pad;
+		IndexType *d_new_frontier_cnt;
+		IndexType *d_mu;
+		IndexType *d_unvisited_cnt;
+		IndexType *d_left_unvisited_cnt;
+		void *d_cub_exclusive_sum_storage;
+		size_t cub_exclusive_sum_storage_bytes;
+
+		//Parameters for direction optimizing
+		IndexType alpha, beta;
+		cudaStream_t stream;
+
+		//resets pointers defined by d_counters_pad (see implem)
+		void resetDevicePointers();
+		void setup();
+		void clean();
+
+	public:
+		virtual ~Bfs(void) {
+			clean();
+		}
+		//Optional outputs start disabled (NULL/false) so that clean() is well defined even if configure() is never called
+		Bfs(	IndexType _n,
+				IndexType _nnz,
+				IndexType *_row_offsets,
+				IndexType *_col_indices,
+				bool _directed,
+				IndexType _alpha,
+				IndexType _beta,
+				cudaStream_t _stream = 0) :
+						n(_n),
+						nnz(_nnz),
+						row_offsets(_row_offsets),
+						col_indices(_col_indices),
+						directed(_directed), deterministic(false),
+						useEdgeMask(false), computeDistances(false), computePredecessors(false),
+						distances(NULL), predecessors(NULL), edge_mask(NULL),
+						alpha(_alpha), beta(_beta), stream(_stream) {
+			setup();
+		}
+		//Sets the optional outputs (distances, predecessors) and the edge mask - a NULL pointer disables each of them
+		void configure(IndexType *distances, IndexType *predecessors, int *edge_mask);
+		//Runs a BFS from source_vertex using the pointers given to configure()
+		void traverse(IndexType source_vertex);
+	};
+} // end namespace cugraph
+
diff --git a/src/bfs_kernels.cuh b/src/bfs_kernels.cuh
new file mode 100644
index 00000000000..7322293bce1
--- /dev/null
+++ b/src/bfs_kernels.cuh
@@ -0,0 +1,1560 @@
+#include <iostream>
+
+#include <cub/cub.cuh>
+#include <utilities/sm_utils.h>
+
+#define MAXBLOCKS 65535
+#define WARP_SIZE 32
+#define INT_SIZE 32
+
+//
+// Bottom up macros
+//
+
+#define FILL_UNVISITED_QUEUE_DIMX 256
+
+#define COUNT_UNVISITED_EDGES_DIMX 256
+
+#define MAIN_BOTTOMUP_DIMX 256
+#define MAIN_BOTTOMUP_NWARPS (MAIN_BOTTOMUP_DIMX/WARP_SIZE)
+
+#define LARGE_BOTTOMUP_DIMX 256
+
+//Number of edges processed in the main bottom up kernel
+#define MAIN_BOTTOMUP_MAX_EDGES 6
+
+//Power of 2 < 32 (strict <)
+#define BOTTOM_UP_LOGICAL_WARP_SIZE 4
+
+//
+// Top down macros
+//
+
+// We will precompute the results of binsearch_maxle every TOP_DOWN_BUCKET_SIZE edges
+#define TOP_DOWN_BUCKET_SIZE 32
+
+// DimX of the kernel
+#define TOP_DOWN_EXPAND_DIMX 256
+
+// TOP_DOWN_EXPAND_DIMX edges -> NBUCKETS_PER_BLOCK buckets
+#define NBUCKETS_PER_BLOCK (TOP_DOWN_EXPAND_DIMX/TOP_DOWN_BUCKET_SIZE)
+
+// How many items_per_thread we can process with one bucket_offset loading
+// the -1 is here because we need the +1 offset
+#define MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD (TOP_DOWN_BUCKET_SIZE - 1)
+
+// instruction parallelism
+// for how many edges will we create instruction parallelism
+#define TOP_DOWN_BATCH_SIZE 2
+
+#define COMPUTE_BUCKET_OFFSETS_DIMX 512
+
+//Other macros
+
+#define FLAG_ISOLATED_VERTICES_DIMX 128
+
+//Number of vertices handled by one thread
+//Must be power of 2, lower than 32
+#define FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD 4 
+
+//Number of threads involved in the "construction" of one int in the bitset
+#define FLAG_ISOLATED_VERTICES_THREADS_PER_INT (INT_SIZE/FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD)
+
+//
+// Parameters of the heuristic to switch between bottomup/topdown
+//Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf 
+//
+
+using namespace cugraph;
+
+namespace bfs_kernels {
+	//
+	// Maps an index type to the CUDA vector types packing 2 or 4 of them (vec2, vec4)
+	// and to its maximum value (max). It would be better to use numeric_limits<>::max() once
+	// cpp11 is allowed in nvgraph
+	//
+	// Generic fallback : int vector types, no max member
+	template<typename >
+	struct vec_t {
+		typedef int4 vec4;
+		typedef int2 vec2;
+	};
+
+	template<>
+	struct vec_t<int> {
+		typedef int4 vec4;
+		typedef int2 vec2;
+		static const int max = INT_MAX;
+	};
+
+	template<>
+	struct vec_t<long long int> {
+		typedef longlong4 vec4;
+		typedef longlong2 vec2;
+		static const long long int max = LLONG_MAX;
+	};
+
+	//
+	// ------------------------- Helper device functions -------------------
+	//
+
+	__forceinline__ __device__ int getMaskNRightmostBitSet(int n) {
+		//Mask with the n least significant bits set, for 0 <= n <= INT_SIZE
+		if (n == INT_SIZE)
+			return (~0); //a shift by the full width is undefined, handled apart
+		return (int) ((1u << n) - 1u); //unsigned arithmetic : no signed overflow when n == INT_SIZE - 1
+	}
+
+	__forceinline__ __device__ int getMaskNLeftmostBitSet(int n) {
+		//Mask with the n most significant bits set, for 0 <= n <= INT_SIZE
+		if (n == 0)
+			return 0; //would otherwise shift by the full width, which is undefined
+		return (int) ~((1u << (INT_SIZE - n)) - 1u); //unsigned arithmetic : no signed overflow when n == 1
+	}
+
+	__forceinline__ __device__ int getNextZeroBit(int& val) { //Returns the index of the least significant zero bit of val, and sets it in val
+		int ibit = __ffs(~val) - 1; //__ffs is 1-based ; NOTE(review): ibit is -1 if val has no zero bit - callers are expected to avoid that case
+		val |= (1 << ibit);
+		//val now has that bit set, so a following call moves on to the next zero bit
+		return ibit;
+	}
+
+	struct BitwiseAnd //Binary functor returning a & b, usable from host and device code
+	{
+		template<typename T>
+		__host__  __device__  __forceinline__ T operator()(const T &a, const T &b) const
+																			{
+			return (a & b);
+		}
+	};
+
+	struct BitwiseOr //Binary functor returning a | b, usable from host and device code
+	{
+		template<typename T>
+		__host__  __device__  __forceinline__ T operator()(const T &a, const T &b) const
+																			{
+			return (a | b);
+		}
+	};
+
+	template<typename IndexType>
+	__device__ IndexType binsearch_maxle(	const IndexType *vec,
+														const IndexType val,
+														IndexType low,
+														IndexType high) {
+		//Binary search in vec[low..high] for the largest index whose value is <= val
+		//Assumes vec is sorted in ascending order on that range and that such an index exists (vec[low] <= val)
+		while (low != high && (low + 1) != high) {
+			const IndexType middle = low + (high - low) / 2;
+
+			if (vec[middle] > val)
+				high = middle - 1; //middle and everything after it are too large
+			else
+				low = middle; //middle is a valid candidate, keeping it in range
+		}
+		//At most two candidates are left
+		if (low == high)
+			return low;
+		return (vec[high] <= val) ? high : low;
+	}
+
+	//
+	//  -------------------------  Bottom up -------------------------
+	//
+
+	//
+	// fill_unvisited_queue_kernel
+	//
+	// Finding unvisited vertices in the visited_bmap, and putting them in the queue
+	// Vertices represented by the same int in the bitmap are adjacent in the queue, and sorted
+	// For instance, the queue can look like this :
+	// 34 38 45 58 61 4 18 24 29 71 84 85 90
+	// Because they are represented by those ints in the bitmap :
+	// [34 38 45 58 61] [4 18 24 29] [71 84 85 90]
+
+	//visited_bmap_nints = the visited_bmap is made of that number of ints
+
+	template<typename IndexType>
+	__global__ void fill_unvisited_queue_kernel(	int *visited_bmap,
+																IndexType visited_bmap_nints,
+																IndexType n,
+																IndexType *unvisited,
+																IndexType *unvisited_cnt) {
+		typedef cub::BlockScan<int, FILL_UNVISITED_QUEUE_DIMX> BlockScan;
+		__shared__ typename BlockScan::TempStorage scan_temp_storage;
+
+		//When filling the "unvisited" queue, we use "unvisited_cnt" to know where to write in the queue (equivalent of int off = atomicAdd(unvisited_cnt, 1) )
+		//We will actually do only one atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common offset for the block in
+		//unvisited_common_block_offset
+		__shared__ IndexType unvisited_common_block_offset;
+
+		//We don't want threads divergence in the loop (we're going to call __syncthreads)
+		//Using a block-only dependent value in the condition of the loop
+		for (IndexType block_v_idx = blockIdx.x * blockDim.x;
+				block_v_idx < visited_bmap_nints;
+				block_v_idx += blockDim.x * gridDim.x) {
+
+			//Index of visited_bmap that this thread will compute
+			IndexType v_idx = block_v_idx + threadIdx.x;
+
+			int thread_visited_int = (v_idx < visited_bmap_nints)
+												? visited_bmap[v_idx]
+													:
+													(~0); //will be neutral in the next lines (virtual vertices all visited)
+
+			//The last int can only be partially valid
+			//If we are indeed taking care of the last visited int in this thread,
+			//We need to first disable (ie set as "visited") the inactive bits (vertices >= n)
+			if (v_idx == (visited_bmap_nints - 1)) {
+				int active_bits = n - (INT_SIZE * v_idx);
+				int inactive_bits = INT_SIZE - active_bits;
+				int mask = getMaskNLeftmostBitSet(inactive_bits);
+				thread_visited_int |= mask; //Setting inactive bits as visited
+			}
+
+			//Counting number of unvisited vertices represented by this int
+			int n_unvisited_in_int = __popc(~thread_visited_int);
+			int unvisited_thread_offset;
+
+			//We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue
+			//We ask for that space when computing the block scan, that will tell where to write those
+			//vertices in the queue, using the common offset of the block (see below)
+			BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset);
+
+			//Last thread knows how many vertices will be written to the queue by this block
+			//Asking for that space in the queue using the global count, and saving the common offset
+			if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) {
+				IndexType total = unvisited_thread_offset + n_unvisited_in_int;
+				unvisited_common_block_offset = atomicAdd(unvisited_cnt, total);
+			}
+
+			//syncthreads for two reasons :
+			// - we need to broadcast unvisited_common_block_offset
+			// - we will reuse scan_temp_storage (cf CUB doc)
+			__syncthreads();
+
+			IndexType current_unvisited_index = unvisited_common_block_offset
+					+ unvisited_thread_offset;
+			int nvertices_to_write = n_unvisited_in_int;
+
+			// getNextZeroBit uses __ffs, which gives least significant bit set
+			// which means that as long as n_unvisited_in_int is valid,
+			// we will use valid bits
+			// Vertices are written with 4-wide or 2-wide vector stores whenever the queue index is suitably aligned, one by one otherwise
+			while (nvertices_to_write > 0) {
+				if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) {
+					typename vec_t<IndexType>::vec4 vec_v;
+
+					vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int);
+					vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int);
+					vec_v.z = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int);
+					vec_v.w = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int);
+
+					typename vec_t<IndexType>::vec4 *unvisited_i4 = reinterpret_cast<typename vec_t<
+							IndexType>::vec4*>(&unvisited[current_unvisited_index]);
+					*unvisited_i4 = vec_v;
+
+					current_unvisited_index += 4;
+					nvertices_to_write -= 4;
+				}
+				else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) {
+					typename vec_t<IndexType>::vec2 vec_v;
+
+					vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int);
+					vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int);
+
+					typename vec_t<IndexType>::vec2 *unvisited_i2 = reinterpret_cast<typename vec_t<
+							IndexType>::vec2*>(&unvisited[current_unvisited_index]);
+					*unvisited_i2 = vec_v;
+
+					current_unvisited_index += 2;
+					nvertices_to_write -= 2;
+				} else {
+					IndexType v = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int);
+
+					unvisited[current_unvisited_index] = v;
+
+					current_unvisited_index += 1;
+					nvertices_to_write -= 1;
+				}
+
+			}
+		}
+	}
+
+	//Wrapper : launches fill_unvisited_queue_kernel on m_stream (the deterministic flag is not used here)
+	template<typename IndexType>
+	void fill_unvisited_queue(	int *visited_bmap,
+										IndexType visited_bmap_nints,
+										IndexType n,
+										IndexType *unvisited,
+										IndexType *unvisited_cnt,
+										cudaStream_t m_stream,
+										bool deterministic) {
+		dim3 grid, block;
+		block.x = FILL_UNVISITED_QUEUE_DIMX;
+		//One thread per int of the bitmap, capped at MAXBLOCKS blocks (the kernel loops over the remaining ints)
+		grid.x = min((IndexType) MAXBLOCKS, (visited_bmap_nints + block.x - 1) / block.x);
+
+		fill_unvisited_queue_kernel<<<grid, block, 0, m_stream>>>(	visited_bmap,
+																						visited_bmap_nints,
+																						n,
+																						unvisited,
+																						unvisited_cnt);
+		cudaCheckError()
+		;
+	}
+
+	//
+	// count_unvisited_edges_kernel
+	// Couting the total number of unvisited edges in the graph - using an potentially unvisited queue
+	// We need the current unvisited vertices to be in the unvisited queue
+	// But visited vertices can be in the potentially_unvisited queue
+	// We first check if the vertex is still unvisited before using it
+	// Useful when switching from "Bottom up" to "Top down"
+	//
+
+	template<typename IndexType>
+	__global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisited,
+																const IndexType potentially_unvisited_size,
+																const int *visited_bmap,
+																IndexType *degree_vertices,
+																IndexType *mu) {
+		typedef cub::BlockReduce<IndexType, COUNT_UNVISITED_EDGES_DIMX> BlockReduce;
+		__shared__ typename BlockReduce::TempStorage reduce_temp_storage;
+
+		//number of undiscovered edges counted by this thread
+		IndexType thread_unvisited_edges_count = 0;
+		//Each thread walks the queue with a stride of blockDim.x * gridDim.x
+		for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x;
+				idx < potentially_unvisited_size;
+				idx += blockDim.x * gridDim.x) {
+
+			IndexType u = potentially_unvisited[idx];
+			int u_visited_bmap = visited_bmap[u / INT_SIZE];
+			int is_visited = u_visited_bmap & (1 << (u % INT_SIZE));
+			//Only vertices that are still unvisited contribute their degree
+			if (!is_visited)
+				thread_unvisited_edges_count += degree_vertices[u];
+
+		}
+
+		//We need all thread_unvisited_edges_count to be ready before reducing
+		__syncthreads();
+
+		IndexType block_unvisited_edges_count =
+				BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count);
+
+		//block_unvisited_edges_count is only defined if threadIdx.x == 0 - one atomicAdd per block accumulates into mu
+		if (threadIdx.x == 0)
+			atomicAdd(mu, block_unvisited_edges_count);
+	}
+
+	//Wrapper : launches count_unvisited_edges_kernel on m_stream, the kernel adds the counted edges to *mu
+	template<typename IndexType>
+	void count_unvisited_edges(const IndexType *potentially_unvisited,
+										const IndexType potentially_unvisited_size,
+										const int *visited_bmap,
+										IndexType *node_degree,
+										IndexType *mu,
+										cudaStream_t m_stream) {
+		//Empty queue : nothing to add to *mu, and a launch with grid.x == 0 would be an invalid configuration
+		if (potentially_unvisited_size <= 0) return;
+		dim3 grid, block;
+		block.x = COUNT_UNVISITED_EDGES_DIMX;
+		grid.x = min((IndexType) MAXBLOCKS, (potentially_unvisited_size + block.x - 1) / block.x);
+		count_unvisited_edges_kernel<<<grid, block, 0, m_stream>>>(	potentially_unvisited,
+																						potentially_unvisited_size,
+																						visited_bmap,
+																						node_degree,
+																						mu);
+		cudaCheckError();
+	}
+
+	//
+	// Main Bottom Up kernel
+	// Here we will start to process unvisited vertices in the unvisited queue
+	// We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges
+	// If it's not possible to define a valid parent using only those edges,
+	// add it to the "left_unvisited_queue"
+	//
+
+	//
+	// We will use the "vertices represented by the same int in the visited bmap are adjacents and sorted in the unvisited queue" property
+	// It is used to do a reduction locally and fully build the new visited_bmap
+	//
+
+	// main_bottomup_kernel
+	//
+	// Each thread takes one vertex of the unvisited queue and scans at most its first
+	// MAIN_BOTTOMUP_MAX_EDGES edges, looking for a neighbor whose distance is lvl - 1.
+	//  - parent found : distances / predecessors are written (when non NULL) and the vertex
+	//    is appended to new_frontier
+	//  - no parent found and degree > MAIN_BOTTOMUP_MAX_EDGES : the vertex is appended to left_unvisited
+	// The visited_bmap ints covering the processed vertices are rebuilt with a segmented reduction.
+	//
+	// Launch requirements : blockDim.x must be MAIN_BOTTOMUP_DIMX, because the CUB primitives
+	// below are instantiated for that block size. distances is read to identify the vertices
+	// of the previous level, so it must not be NULL. edge_mask may be NULL (no edge disabled).
+	template<typename IndexType>
+	__global__ void main_bottomup_kernel(	const IndexType *unvisited,
+														const IndexType unvisited_size,
+														IndexType *left_unvisited,
+														IndexType *left_unvisited_cnt,
+														int *visited_bmap,
+														const IndexType *row_ptr,
+														const IndexType *col_ind,
+														IndexType lvl,
+														IndexType *new_frontier,
+														IndexType *new_frontier_cnt,
+														IndexType *distances,
+														IndexType *predecessors,
+														int *edge_mask) {
+		typedef cub::BlockDiscontinuity<IndexType, MAIN_BOTTOMUP_DIMX> BlockDiscontinuity;
+		typedef cub::WarpReduce<int> WarpReduce;
+		typedef cub::BlockScan<int, MAIN_BOTTOMUP_DIMX> BlockScan;
+
+		__shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage;
+		// CUB requires one TempStorage per warp calling WarpReduce concurrently :
+		// every warp of the block gets its own slot (indexed by warpid below)
+		__shared__ typename WarpReduce::TempStorage reduce_temp_storage[MAIN_BOTTOMUP_NWARPS];
+		__shared__ typename BlockScan::TempStorage scan_temp_storage;
+
+		//To write vertices in the frontier,
+		//We will use a block scan to locally compute the offsets
+		//frontier_common_block_offset contains the common offset for the block
+		__shared__ IndexType frontier_common_block_offset;
+
+		// When building the new visited_bmap, we reduce (using a bitwise and) the visited_bmap ints
+		// from the vertices represented by the same int (for instance vertices 1, 5, 9, 13, 23)
+		// vertices represented by the same int will be designated as part of the same "group"
+		// To detect the delimitations between those groups, we use BlockDiscontinuity
+		// Then we need to create the new "visited_bmap" within those groups.
+		// We use a warp reduction that takes into account limits between groups to do it
+		// But a group can be cut in two different warps : in that case, the second warp
+		// puts the result of its local reduction in local_visited_bmap_warp_head
+		// the first warp will then read it and finish the reduction
+
+		__shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS];
+
+		const int warpid = threadIdx.x / WARP_SIZE;
+		const int laneid = threadIdx.x % WARP_SIZE;
+
+		// we will call __syncthreads inside the loop
+		// we need to keep complete block active
+		for (IndexType block_off = blockIdx.x * blockDim.x;
+				block_off < unvisited_size;
+				block_off += blockDim.x * gridDim.x)
+						{
+			IndexType idx = block_off + threadIdx.x;
+
+			// This thread will take care of unvisited_vertex
+			// in the visited_bmap, it is represented by the int at index
+			// visited_bmap_index = unvisited_vertex/INT_SIZE
+			// it will be used by BlockDiscontinuity
+			// to flag the separation between groups of vertices (vertices represented by different ints in visited_bmap)
+			IndexType visited_bmap_index[1]; //this is an array of size 1 because CUB needs one
+			visited_bmap_index[0] = -1;
+			IndexType unvisited_vertex = -1;
+
+			// local_visited_bmap gives info on the visited bit of unvisited_vertex
+			//
+			// By default, everything is visited
+			// This is because we only take care of unvisited vertices here,
+			// The others are by default visited
+			// If a vertex remains unvisited, we will notice it here
+			// That's why by default we consider everything visited ( ie ~0 )
+			// If we fail to assign one parent to an unvisited vertex, we will
+			// explicitly unset the bit
+			int local_visited_bmap = (~0);
+			int found = 0;
+			int more_to_visit = 0;
+			IndexType valid_parent; // only meaningful when found != 0
+			IndexType left_unvisited_off; // only meaningful when more_to_visit != 0
+
+			if (idx < unvisited_size)
+					{
+				//Processing first MAIN_BOTTOMUP_MAX_EDGES edges of unvisited v
+				//If bigger than that, push to left_unvisited queue
+				unvisited_vertex = unvisited[idx];
+
+				IndexType edge_begin = row_ptr[unvisited_vertex];
+				IndexType edge_end = row_ptr[unvisited_vertex + 1];
+
+				visited_bmap_index[0] = unvisited_vertex / INT_SIZE;
+
+				IndexType degree = edge_end - edge_begin;
+
+				for (IndexType edge = edge_begin;
+						edge < min(edge_end, edge_begin + MAIN_BOTTOMUP_MAX_EDGES); ++edge)
+						{
+					// Disabled edges are ignored
+					if (edge_mask && !edge_mask[edge])
+						continue;
+
+					IndexType parent_candidate = col_ind[edge];
+
+					if (distances[parent_candidate] == (lvl - 1))
+							{
+						found = 1;
+						valid_parent = parent_candidate;
+						break;
+					}
+				}
+
+				// This vertex will remain unvisited at the end of this kernel
+				// Explicitly say it
+				if (!found)
+					local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); //let this one unvisited
+				else
+				{
+					if (distances)
+						distances[unvisited_vertex] = lvl;
+					if (predecessors)
+						predecessors[unvisited_vertex] = valid_parent;
+				}
+
+				//If we haven't found a parent and there are more edges to check
+				if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES)
+				{
+					left_unvisited_off = atomicAdd(left_unvisited_cnt, (IndexType) 1);
+					more_to_visit = 1;
+				}
+
+			}
+
+			//
+			// We will separate vertices in groups
+			// Two vertices are in the same group if represented by same int in visited_bmap
+			// ie u and v in same group <=> u/32 == v/32
+			//
+			// We will now flag the head of those groups (first element of each group)
+			//
+			// 1) All vertices within the same group are adjacent in the queue (cf fill_unvisited_queue)
+			// 2) A group is of size <= 32, so a warp will contain at least one head, and a group will be contained
+			// at most by two warps
+
+			int is_head_a[1]; //CUB needs an array
+			BlockDiscontinuity(discontinuity_temp_storage).FlagHeads(is_head_a,
+																						visited_bmap_index,
+																						cub::Inequality());
+			int is_head = is_head_a[0];
+
+			// Computing the warp reduce within group
+			// This primitive uses the is_head flags to know where the limits of the groups are
+			// We use bitwise and as operator, because of the fact that 1 is the default value
+			// If a vertex is unvisited, we have to explicitly ask for it
+			int local_bmap_agg =
+					WarpReduce(reduce_temp_storage[warpid]).HeadSegmentedReduce(	local_visited_bmap,
+																										is_head,
+																										BitwiseAnd());
+
+			// We need to take care of the groups cut in two in two different warps
+			// Saving second part of the reduce here, then applying it on the first part below
+			// Corner case : if the first thread of the warp is a head, then this group is not cut in two
+			// and then we have to be neutral (for a bitwise and, it's an ~0)
+			if (laneid == 0)
+					{
+				local_visited_bmap_warp_head[warpid] = (is_head) ? (~0) : local_bmap_agg;
+			}
+
+			//broadcasting local_visited_bmap_warp_head
+			__syncthreads();
+
+			int head_ballot = cugraph::utils::ballot(is_head);
+
+			//As long as idx < unvisited_size, we know there's at least one head per warp
+			int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot);
+
+			int is_last_head_in_warp = (laneid == laneid_last_head_in_warp);
+
+			// if laneid == 0 && is_last_head_in_warp, it's a special case where
+			// a group of size 32 starts exactly at lane 0
+			// in that case, nothing to do (this group is not cut by a warp delimitation)
+			// we also have to make sure that a warp actually exists after this one (this corner case is handled after)
+			if (laneid != 0 && is_last_head_in_warp && ((warpid + 1) < MAIN_BOTTOMUP_NWARPS))
+			{
+				local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1];
+			}
+
+			//Three cases :
+			// -> This is the first group of the block - it may be cut in two (with previous block)
+			// -> This is the last group of the block - same thing
+			// -> This group is completely contained in this block
+
+			if (warpid == 0 && laneid == 0)
+					{
+				//The first elt of this group considered in this block is unvisited_vertex
+				//We know that's the case because elts are sorted in a group, and we are at laneid == 0
+				//We will do an atomicOr - we have to be neutral about elts < unvisited_vertex
+				int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid
+				int mask = getMaskNLeftmostBitSet(INT_SIZE - iv);
+				local_bmap_agg &= mask; //we have to be neutral for elts < unvisited_vertex
+				atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg);
+			}
+			else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) &&
+					laneid >= laneid_last_head_in_warp && // We need the other ones to go in else case
+					idx < unvisited_size //we could be out
+							)
+							{
+				//Last head of the block
+				//We don't know if this group is complete
+
+				//last_v is the last unvisited_vertex of the group IN THIS block
+				//we don't know about the rest - we have to be neutral about elts > last_v
+
+				//the destination thread of the __shfl is active
+				int laneid_max = min((IndexType) (WARP_SIZE - 1),
+											(unvisited_size - (block_off + 32 * warpid)));
+				IndexType last_v = cugraph::utils::shfl(	unvisited_vertex,
+																		laneid_max,
+																		WARP_SIZE,
+																		__activemask());
+
+				if (is_last_head_in_warp)
+				{
+					int ilast_v = last_v % INT_SIZE + 1;
+					int mask = getMaskNRightmostBitSet(ilast_v);
+					local_bmap_agg &= mask; //we have to be neutral for elts > last_unvisited_vertex
+					atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg);
+				}
+			}
+			else
+			{
+				//group completely in block
+				if (is_head && idx < unvisited_size) {
+					visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; //no atomics needed, we know everything about this int
+				}
+			}
+
+			//Saving in frontier
+
+			int thread_frontier_offset;
+			BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset);
+			IndexType inclusive_sum = thread_frontier_offset + found;
+			// The last thread of the block holds the block total : it reserves the space in new_frontier
+			if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum)
+					{
+				frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum);
+			}
+
+			//1) Broadcasting frontier_common_block_offset
+			//2) we want to reuse the *_temp_storage
+			__syncthreads();
+
+			if (found)
+				new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex;
+			if (more_to_visit)
+				left_unvisited[left_unvisited_off] = unvisited_vertex;
+
+		}
+	}
+
+	// bottom_up_main
+	// Host launcher of main_bottomup_kernel on stream m_stream.
+	// The block size is MAIN_BOTTOMUP_DIMX and the grid is capped to MAXBLOCKS
+	// (the kernel loops over the queue, so a capped grid still covers it).
+	// All pointers are forwarded to the kernel unchanged.
+	// The deterministic flag is not used by this launcher.
+	template<typename IndexType>
+	void bottom_up_main(	IndexType *unvisited,
+								IndexType unvisited_size,
+								IndexType *left_unvisited,
+								IndexType *d_left_unvisited_idx,
+								int *visited,
+								const IndexType *row_ptr,
+								const IndexType *col_ind,
+								IndexType lvl,
+								IndexType *new_frontier,
+								IndexType *new_frontier_idx,
+								IndexType *distances,
+								IndexType *predecessors,
+								int *edge_mask,
+								cudaStream_t m_stream,
+								bool deterministic) {
+		// Empty queue : nothing to do. Launching would also fail, because
+		// the grid size computed below would be 0 (invalid launch configuration)
+		if (unvisited_size <= 0)
+			return;
+
+		dim3 grid, block;
+		block.x = MAIN_BOTTOMUP_DIMX;
+
+		// One thread per unvisited vertex, rounded up to a whole number of blocks
+		grid.x = min((IndexType) MAXBLOCKS, ((unvisited_size + block.x - 1)) / block.x);
+
+		main_bottomup_kernel<<<grid, block, 0, m_stream>>>(unvisited,
+																			unvisited_size,
+																			left_unvisited,
+																			d_left_unvisited_idx,
+																			visited,
+																			row_ptr,
+																			col_ind,
+																			lvl,
+																			new_frontier,
+																			new_frontier_idx,
+																			distances,
+																			predecessors,
+																			edge_mask);
+		cudaCheckError()
+		;
+	}
+
+	//
+	// bottom_up_large_degree_kernel
+	// finishing the work started in main_bottomup_kernel for vertex with degree > MAIN_BOTTOMUP_MAX_EDGES && no parent found
+	//
+	// One logical warp (BOTTOM_UP_LOGICAL_WARP_SIZE threads) is assigned to each vertex of
+	// left_unvisited. Its lanes walk the edges located after the first MAIN_BOTTOMUP_MAX_EDGES
+	// ones, and stop as soon as one lane sees a neighbor whose distance is lvl - 1.
+	// A single lane of the logical warp then publishes the vertex (visited bit, distance,
+	// predecessor, new frontier entry).
+	template<typename IndexType>
+	__global__ void bottom_up_large_degree_kernel(	IndexType *left_unvisited,
+																	IndexType left_unvisited_size,
+																	int *visited,
+																	const IndexType *row_ptr,
+																	const IndexType *col_ind,
+																	IndexType lvl,
+																	IndexType *new_frontier,
+																	IndexType *new_frontier_cnt,
+																	IndexType *distances,
+																	IndexType *predecessors,
+																	int *edge_mask) {
+
+		const int lane_in_group = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE;
+		const int group_in_block = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE;
+		const int groups_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE;
+
+		// Where the votes of this logical warp sit inside the ballot of the hardware warp
+		const int group_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE;
+		const unsigned int group_bits = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1;
+
+		//Inactive threads are not a problem for the ballot (known behaviour)
+		for (IndexType queue_pos = groups_per_block * blockIdx.x + group_in_block;
+				queue_pos < left_unvisited_size;
+				queue_pos += gridDim.x * groups_per_block) {
+
+			//Unvisited vertex - potentially in the next frontier
+			const IndexType vertex = left_unvisited[queue_pos];
+
+			//Used only with symmetric graphs : parents are included in the neighbors of vertex
+			//The first MAIN_BOTTOMUP_MAX_EDGES edges have already been checked, skip them
+			const IndexType edges_begin = row_ptr[vertex] + MAIN_BOTTOMUP_MAX_EDGES;
+			const IndexType edges_end = row_ptr[vertex + 1];
+
+			//Lanes may leave this loop at different iterations (warp divergence)
+			//That is fine, the behaviour of the ballot with inactive threads is known
+			for (IndexType e = edges_begin + lane_in_group;
+					e < edges_end;
+					e += BOTTOM_UP_LOGICAL_WARP_SIZE) {
+
+				IndexType parent = -1;
+
+				const bool edge_enabled = !edge_mask || edge_mask[e];
+				if (edge_enabled) {
+					const IndexType neighbor = col_ind[e];
+
+					if (distances[neighbor] == (lvl - 1)) {
+						parent = neighbor;
+					}
+				}
+
+				// Votes of the whole hardware warp, then restricted to our logical warp
+				unsigned int warp_votes = cugraph::utils::ballot((parent != -1));
+				unsigned int group_votes = warp_votes
+						>> (BOTTOM_UP_LOGICAL_WARP_SIZE * group_in_warp);
+				group_votes &= group_bits;
+
+				// Lowest lane holding a valid parent (-1 when nobody voted)
+				const int winner = __ffs(group_votes) - 1;
+
+				if (winner == lane_in_group) {
+					//Only one valid parent is used (reduces bandwidth)
+					const IndexType slot = atomicAdd(new_frontier_cnt, (IndexType) 1);
+					const int bit = 1 << (vertex % INT_SIZE);
+					atomicOr(&visited[vertex / INT_SIZE], bit);
+					distances[vertex] = lvl;
+
+					if (predecessors)
+						predecessors[vertex] = parent;
+
+					new_frontier[slot] = vertex;
+				}
+
+				// A parent has been found for this vertex, no need to look at more edges
+				if (group_votes) {
+					break;
+				}
+			}
+
+		}
+	}
+
+	// bottom_up_large
+	// Host launcher of bottom_up_large_degree_kernel on stream m_stream.
+	// The kernel assigns one logical warp of BOTTOM_UP_LOGICAL_WARP_SIZE threads to each
+	// vertex of left_unvisited, so a block of LARGE_BOTTOMUP_DIMX threads handles
+	// LARGE_BOTTOMUP_DIMX / BOTTOM_UP_LOGICAL_WARP_SIZE vertices per pass.
+	// The grid is capped to MAXBLOCKS (the kernel loops over the queue).
+	// The deterministic flag is not used by this launcher.
+	template<typename IndexType>
+	void bottom_up_large(IndexType *left_unvisited,
+								IndexType left_unvisited_size,
+								int *visited,
+								const IndexType *row_ptr,
+								const IndexType *col_ind,
+								IndexType lvl,
+								IndexType *new_frontier,
+								IndexType *new_frontier_idx,
+								IndexType *distances,
+								IndexType *predecessors,
+								int *edge_mask,
+								cudaStream_t m_stream,
+								bool deterministic) {
+		// Empty queue : nothing to finish (and a grid of size 0 cannot be launched)
+		if (left_unvisited_size <= 0)
+			return;
+
+		dim3 grid, block;
+		block.x = LARGE_BOTTOMUP_DIMX;
+
+		// Number of vertices handled by one block in one pass
+		IndexType logical_warps_per_block = (IndexType) (block.x / BOTTOM_UP_LOGICAL_WARP_SIZE);
+
+		// Ceil division : just enough blocks to give one logical warp to each vertex.
+		// Dividing by the number of logical warps (instead of multiplying the size by
+		// the logical warp size) avoids overflowing IndexType on large queues.
+		IndexType nblocks = (left_unvisited_size + logical_warps_per_block - 1)
+				/ logical_warps_per_block;
+
+		grid.x = min((IndexType) MAXBLOCKS, nblocks);
+
+		bottom_up_large_degree_kernel<<<grid, block, 0, m_stream>>>(left_unvisited,
+																						left_unvisited_size,
+																						visited,
+																						row_ptr,
+																						col_ind,
+																						lvl,
+																						new_frontier,
+																						new_frontier_idx,
+																						distances,
+																						predecessors,
+																						edge_mask);
+		cudaCheckError()
+		;
+	}
+
+	//
+	//
+	//  ------------------------------ Top down ------------------------------
+	//
+	//
+
+	//
+	// compute_bucket_offsets_kernel
+	// for every edge whose index is TOP_DOWN_BUCKET_SIZE * k (k integer), simply compute the position in the frontier of the vertex owning that edge
+	//
+
+	// Fills bucket_offsets[b], for b in [0, last_bucket], with the result of
+	// binsearch_maxle on frontier_degrees_exclusive_sum for edge b * TOP_DOWN_BUCKET_SIZE.
+	// Edge indexes past the end are clamped to the last edge (total_degree - 1).
+	template<typename IndexType>
+	__global__ void compute_bucket_offsets_kernel(	const IndexType *frontier_degrees_exclusive_sum,
+																	IndexType *bucket_offsets,
+																	const IndexType frontier_size,
+																	IndexType total_degree) {
+		// Index of the last bucket offset to write (inclusive)
+		const IndexType last_bucket = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX
+				* NBUCKETS_PER_BLOCK + 1);
+
+		// Search range inside frontier_degrees_exclusive_sum, identical for every bucket
+		const IndexType search_first = (IndexType) 0;
+		const IndexType search_last = frontier_size - 1;
+
+		IndexType bucket = blockIdx.x * blockDim.x + threadIdx.x;
+
+		while (bucket <= last_bucket) {
+			// First edge of the bucket, clamped to the last edge
+			const IndexType edge = min(bucket * TOP_DOWN_BUCKET_SIZE, total_degree - 1);
+
+			bucket_offsets[bucket] = binsearch_maxle(	frontier_degrees_exclusive_sum,
+																	edge,
+																	search_first,
+																	search_last);
+
+			bucket += gridDim.x * blockDim.x;
+		}
+	}
+
+	// Host launcher of compute_bucket_offsets_kernel on stream m_stream.
+	// cumul is passed to the kernel as frontier_degrees_exclusive_sum.
+	// Block size is COMPUTE_BUCKET_OFFSETS_DIMX, the grid is capped to MAXBLOCKS.
+	template<typename IndexType>
+	void compute_bucket_offsets(	IndexType *cumul,
+											IndexType *bucket_offsets,
+											IndexType frontier_size,
+											IndexType total_degree,
+											cudaStream_t m_stream) {
+		dim3 block;
+		block.x = COMPUTE_BUCKET_OFFSETS_DIMX;
+
+		// Same bound as the one computed inside the kernel
+		const IndexType n_offsets = (total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX
+				* NBUCKETS_PER_BLOCK + 1;
+
+		dim3 grid;
+		grid.x = min(	(IndexType) MAXBLOCKS,
+							(n_offsets + block.x - 1) / block.x);
+
+		compute_bucket_offsets_kernel<<<grid, block, 0, m_stream>>>(cumul,
+																						bucket_offsets,
+																						frontier_size,
+																						total_degree);
+		cudaCheckError()
+		;
+	}
+
+	//
+	// topdown_expand_kernel
+	// Read current frontier and compute new one with top down paradigm
+	// One thread = One edge
+	// To know origin of edge, we have to find where is index_edge in the values of frontier_degrees_exclusive_sum (using a binary search, max less or equal than)
+	// This index k will give us the origin of this edge, which is frontier[k]
+	// This thread will then process the (linear_idx_thread - frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k]
+	//
+	// To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches
+	// We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x edges
+	//
+	// Once we have those offsets, we may still need a few values from frontier_degrees_exclusive_sum to compute exact index k
+	// To be able to do it, we will load the values that we need from frontier_degrees_exclusive_sum in shared memory
+	// We know that it will fit because we never add node with degree == 0 in the frontier, so we have an upper bound on the number of value to load (see below)
+	//
+	// We will then look which vertices are not visited yet :
+	// 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as visited, update distances and predecessors, and move on
+	// 2) if the unvisited vertex has degree > 0, we add it to the "frontier_candidates" queue
+	//
+	// We then treat the candidates queue using the threadIdx.x < ncandidates
+	// If we are indeed the first thread to discover that vertex (result of atomicOr(visited))
+	// We add it to the new frontier
+	//
+
+	// Launch requirements : blockDim.x must be TOP_DOWN_EXPAND_DIMX (the CUB block scan and the
+	// shared arrays below are sized for it). Block b starts at edge
+	// b * blockDim.x * max_items_per_thread and each of its threads handles at most
+	// max_items_per_thread edges.
+	// edge_mask, distances and predecessors may be NULL. isolated_bmap is only read when directed is true.
+	template<typename IndexType>
+	__global__ void topdown_expand_kernel(	const IndexType *row_ptr,
+														const IndexType *col_ind,
+														const IndexType *frontier,
+														const IndexType frontier_size,
+														const IndexType totaldegree,
+														const IndexType max_items_per_thread,
+														const IndexType lvl,
+														IndexType *new_frontier,
+														IndexType *new_frontier_cnt,
+														const IndexType *frontier_degrees_exclusive_sum,
+														const IndexType *frontier_degrees_exclusive_sum_buckets_offsets,
+														int *bmap,
+														IndexType *distances,
+														IndexType *predecessors,
+														const int *edge_mask,
+														const int *isolated_bmap,
+														bool directed) {
+		//BlockScan
+		typedef cub::BlockScan<IndexType, TOP_DOWN_EXPAND_DIMX> BlockScan;
+		__shared__ typename BlockScan::TempStorage scan_storage;
+
+		// We will do a scan to know where to write in frontier
+		// This will contain the common offset of the block
+		__shared__ IndexType frontier_common_block_offset;
+
+		__shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1];
+		__shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1];
+
+		//
+		// Frontier candidates local queue
+		// We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able to store everything
+		// We also save the predecessors here, because we will not be able to retrieve it after
+		//
+		__shared__ IndexType shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE
+				* TOP_DOWN_EXPAND_DIMX];
+		__shared__ IndexType shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE
+				* TOP_DOWN_EXPAND_DIMX];
+		__shared__ IndexType block_n_frontier_candidates;
+
+		IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread;
+		IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1)
+				/ TOP_DOWN_EXPAND_DIMX;
+
+		n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left);
+
+		// All the loop conditions below only depend on block-wide values,
+		// so every thread of the block reaches the same __syncthreads
+		for (;
+				(n_items_per_thread_left > 0) && (block_offset < totaldegree);
+
+				block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x,
+						n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) {
+
+			// In this loop, we will process batch_set_size batches
+			IndexType nitems_per_thread = min(	n_items_per_thread_left,
+															(IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD);
+
+			// Loading buckets offset (see compute_bucket_offsets_kernel)
+
+			if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1))
+				shared_buckets_offsets[threadIdx.x] =
+						frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE
+								+ threadIdx.x];
+
+			// We will use shared_buckets_offsets
+			__syncthreads();
+
+			//
+			// shared_buckets_offsets gives us a range of the possible indexes
+			// for edge of linear_threadx, we are looking for the value k such as
+			// k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx
+			//
+			// we have 0 <= k < frontier_size
+			// but we also have :
+			//
+			// frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE]
+			// <= k
+			// <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1]
+			//
+			// To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below)
+			// We will load them here
+			// We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop
+			// Because all vertices in frontier have degree > 0, we know it will fit if left + 1 = right (see below)
+
+			//We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[
+			//If it doesn't fit, --right until it does, then loop
+			//It is expected to fit on the first try, that's why we start right = nitems_per_thread
+
+			IndexType left = 0;
+			IndexType right = nitems_per_thread;
+
+			while (left < nitems_per_thread) {
+				//
+				// Values that are necessary to compute the local binary searches
+				// We only need those with indexes between extremes indexes of buckets_offsets
+				// We need the next val for the binary search, hence the +1
+				//
+
+				IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK]
+						- shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1;
+
+				//If left + 1 = right we are sure to have nvalues_to_load <= TOP_DOWN_EXPAND_DIMX+1
+				while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) {
+					--right;
+
+					nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK]
+							- shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1;
+				}
+
+				IndexType nitems_per_thread_for_this_load = right - left;
+
+				IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left
+						* NBUCKETS_PER_BLOCK];
+
+				if (threadIdx.x < nvalues_to_load) {
+					shared_frontier_degrees_exclusive_sum[threadIdx.x] =
+							frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset
+									+ threadIdx.x];
+				}
+
+				// There is one more value than threads in the block : thread 0 loads the extra one
+				if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) {
+					shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] =
+							frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset
+									+ TOP_DOWN_EXPAND_DIMX];
+				}
+
+				//shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync
+				__syncthreads();
+
+				// Now we will process the edges
+				// Here each thread will process nitems_per_thread_for_this_load
+				for (IndexType item_index = 0;
+						item_index < nitems_per_thread_for_this_load;
+						item_index += TOP_DOWN_BATCH_SIZE) {
+
+					// We process TOP_DOWN_BATCH_SIZE edges in parallel (instruction parallelism)
+					// Reduces latency
+
+					IndexType current_max_edge_index = min(block_offset
+																				+ (left
+																						+ nitems_per_thread_for_this_load)
+																						* blockDim.x,
+																		totaldegree);
+
+					//We will need vec_u (source of the edge) until the end if we need to save the predecessors
+					//For others informations, we will reuse pointers on the go (nvcc does not color well the registers in that case)
+
+					IndexType vec_u[TOP_DOWN_BATCH_SIZE];
+					IndexType local_buf1[TOP_DOWN_BATCH_SIZE];
+					IndexType local_buf2[TOP_DOWN_BATCH_SIZE];
+
+					IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0];
+
+#pragma unroll
+					for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+
+						IndexType ibatch = left + item_index + iv;
+						IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x;
+
+						if (gid < current_max_edge_index) {
+							IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x)
+									/ TOP_DOWN_BUCKET_SIZE;
+							IndexType bucket_start = shared_buckets_offsets[start_off_idx]
+									- frontier_degrees_exclusive_sum_block_offset;
+							IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1]
+									- frontier_degrees_exclusive_sum_block_offset;
+
+							IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum,
+																	gid,
+																	bucket_start,
+																	bucket_end)
+									+ frontier_degrees_exclusive_sum_block_offset;
+							vec_u[iv] = frontier[k]; // origin of this edge
+							vec_frontier_degrees_exclusive_sum_index[iv] =
+									frontier_degrees_exclusive_sum[k];
+						} else {
+							// Padding slot : no edge to process, -1 marks it as invalid
+							vec_u[iv] = -1;
+							vec_frontier_degrees_exclusive_sum_index[iv] = -1;
+						}
+
+					}
+
+					IndexType *vec_row_ptr_u = &local_buf1[0];
+#pragma unroll
+					for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+						IndexType u = vec_u[iv];
+						//row_ptr for this vertex origin u
+						vec_row_ptr_u[iv] = (u != -1)
+													? row_ptr[u]
+														:
+														-1;
+					}
+
+					//We won't need row_ptr after that, reusing pointer
+					IndexType *vec_dest_v = vec_row_ptr_u;
+
+#pragma unroll
+					for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+						IndexType thread_item_index = left + item_index + iv;
+						IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x;
+
+						IndexType row_ptr_u = vec_row_ptr_u[iv];
+						IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv];
+
+						// Only look at the mask for real edges : for a padding slot (row_ptr_u == -1)
+						// edge is equal to gid, which can be past the end of edge_mask
+						if (row_ptr_u != -1 && edge_mask && !edge_mask[edge])
+							row_ptr_u = -1; //disabling edge
+
+						//Destination of this edge
+						vec_dest_v[iv] = (row_ptr_u != -1)
+												? col_ind[edge]
+													:
+													-1;
+					}
+
+					//We don't need vec_frontier_degrees_exclusive_sum_index anymore
+					IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index;
+#pragma unroll
+					for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+						IndexType v = vec_dest_v[iv];
+						vec_v_visited_bmap[iv] = (v != -1)
+															? bmap[v / INT_SIZE]
+																:
+																(~0); //will look visited
+					}
+
+					// From now on we will consider v as a frontier candidate
+					// If for some reason vec_frontier_candidate[iv] should NOT be put in the new_frontier
+					// Then set vec_frontier_candidate[iv] = -1
+					IndexType *vec_frontier_candidate = vec_dest_v;
+
+#pragma unroll
+					for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+						IndexType v = vec_frontier_candidate[iv];
+						int m = 1 << (v % INT_SIZE);
+
+						int is_visited = vec_v_visited_bmap[iv] & m;
+
+						if (is_visited)
+							vec_frontier_candidate[iv] = -1;
+					}
+
+					if (directed) {
+						//vec_v_visited_bmap is available
+
+						IndexType *vec_is_isolated_bmap = vec_v_visited_bmap;
+
+#pragma unroll
+						for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+							IndexType v = vec_frontier_candidate[iv];
+							vec_is_isolated_bmap[iv] = (v != -1)
+																? isolated_bmap[v / INT_SIZE]
+																	:
+																	-1;
+						}
+
+#pragma unroll
+						for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+							IndexType v = vec_frontier_candidate[iv];
+							int m = 1 << (v % INT_SIZE);
+							int is_isolated = vec_is_isolated_bmap[iv] & m;
+
+							//If v is isolated, we will not add it to the frontier (it's not a frontier candidate)
+							// 1st reason : it's useless
+							// 2nd reason : it will make top down algo fail
+							// we need each node in frontier to have a degree > 0
+							// If it is isolated, we just need to mark it as visited, and save distance and predecessor here. Not need to check return value of atomicOr
+
+							if (is_isolated && v != -1) {
+								int m = 1 << (v % INT_SIZE);
+								atomicOr(&bmap[v / INT_SIZE], m);
+								if (distances)
+									distances[v] = lvl;
+
+								if (predecessors)
+									predecessors[v] = vec_u[iv];
+
+								//This is no longer a candidate, neutralize it
+								vec_frontier_candidate[iv] = -1;
+							}
+
+						}
+					}
+
+					//Number of successor candidate hold by this thread
+					IndexType thread_n_frontier_candidates = 0;
+
+#pragma unroll
+					for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+						IndexType v = vec_frontier_candidate[iv];
+						if (v != -1)
+							++thread_n_frontier_candidates;
+					}
+
+					// We need to have all nfrontier_candidates to be ready before doing the scan
+					__syncthreads();
+
+					// We will put the frontier candidates in a local queue
+					// Computing offsets
+					IndexType thread_frontier_candidate_offset = 0; //offset inside block
+					BlockScan(scan_storage).ExclusiveSum(	thread_n_frontier_candidates,
+																		thread_frontier_candidate_offset);
+
+#pragma unroll
+					for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+						//May have bank conflicts
+						IndexType frontier_candidate = vec_frontier_candidate[iv];
+
+						if (frontier_candidate != -1) {
+							shared_local_new_frontier_candidates[thread_frontier_candidate_offset] =
+									frontier_candidate;
+							shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] =
+									vec_u[iv];
+							++thread_frontier_candidate_offset;
+						}
+					}
+
+					if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) {
+						//No need to add nsuccessor_candidate, even if its an
+						//exclusive sum
+						//We incremented the thread_frontier_candidate_offset
+						block_n_frontier_candidates = thread_frontier_candidate_offset;
+					}
+
+					//broadcast block_n_frontier_candidates
+					__syncthreads();
+
+					IndexType naccepted_vertices = 0;
+					//We won't need vec_frontier_candidate after that
+					IndexType *vec_frontier_accepted_vertex = vec_frontier_candidate;
+
+#pragma unroll
+					for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+						const int idx_shared = iv * blockDim.x + threadIdx.x;
+						vec_frontier_accepted_vertex[iv] = -1;
+
+						if (idx_shared < block_n_frontier_candidates) {
+							IndexType v = shared_local_new_frontier_candidates[idx_shared]; //popping queue
+							int m = 1 << (v % INT_SIZE);
+							int q = atomicOr(&bmap[v / INT_SIZE], m); //atomicOr returns old
+
+							if (!(m & q)) { //if this thread was the first to discover this node
+								if (distances)
+									distances[v] = lvl;
+
+								if (predecessors) {
+									IndexType pred = shared_local_new_frontier_predecessors[idx_shared];
+									predecessors[v] = pred;
+								}
+
+								vec_frontier_accepted_vertex[iv] = v;
+								++naccepted_vertices;
+							}
+						}
+
+					}
+
+					//We need naccepted_vertices to be ready
+					__syncthreads();
+
+					IndexType thread_new_frontier_offset;
+
+					BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset);
+
+					if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) {
+
+						IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices;
+						//for this thread, thread_new_frontier_offset + has_successor (exclusive sum)
+						if (inclusive_sum)
+							frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum);
+					}
+
+					//Broadcasting frontier_common_block_offset
+					__syncthreads();
+
+#pragma unroll
+					for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+						const int idx_shared = iv * blockDim.x + threadIdx.x;
+						if (idx_shared < block_n_frontier_candidates) {
+
+							IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv];
+
+							if (new_frontier_vertex != -1) {
+								IndexType off = frontier_common_block_offset + thread_new_frontier_offset++;
+								new_frontier[off] = new_frontier_vertex;
+							}
+						}
+					}
+
+				}
+
+				//We need to keep shared_frontier_degrees_exclusive_sum coherent
+				__syncthreads();
+
+				//Preparing for next load
+				left = right;
+				right = nitems_per_thread;
+			}
+
+			//we need to keep shared_buckets_offsets coherent
+			__syncthreads();
+		}
+
+	}
+
+	//Host wrapper for topdown_expand_kernel: derives the 1D launch geometry from totaldegree and
+	//enqueues the kernel on m_stream. No launch when totaldegree is zero; `deterministic` is unused here.
+	template<typename IndexType>
+	void frontier_expand(const IndexType *row_ptr,
+			const IndexType *col_ind,
+			const IndexType *frontier,
+			const IndexType frontier_size,
+			const IndexType totaldegree,
+			const IndexType lvl,
+			IndexType *new_frontier,
+			IndexType *new_frontier_cnt,
+			const IndexType *frontier_degrees_exclusive_sum,
+			const IndexType *frontier_degrees_exclusive_sum_buckets_offsets,
+			int *visited_bmap,
+			IndexType *distances,
+			IndexType *predecessors,
+			const int *edge_mask,
+			const int *isolated_bmap,
+			bool directed,
+			cudaStream_t m_stream,
+			bool deterministic) {
+		if (totaldegree == 0)
+			return;
+
+		dim3 threads(TOP_DOWN_EXPAND_DIMX);
+		//ceil(totaldegree / (MAXBLOCKS * threads.x))
+		IndexType items_per_thread = (totaldegree + MAXBLOCKS * threads.x - 1)
+				/ (MAXBLOCKS * threads.x);
+		//ceil(totaldegree / (items_per_thread * threads.x)), never more than MAXBLOCKS
+		dim3 blocks;
+		blocks.x = min(	(totaldegree + items_per_thread * threads.x - 1)
+									/ (items_per_thread * threads.x),
+							(IndexType) MAXBLOCKS);
+
+		topdown_expand_kernel<<<blocks, threads, 0, m_stream>>>(row_ptr,
+				col_ind,
+				frontier,
+				frontier_size,
+				totaldegree,
+				items_per_thread,
+				lvl,
+				new_frontier,
+				new_frontier_cnt,
+				frontier_degrees_exclusive_sum,
+				frontier_degrees_exclusive_sum_buckets_offsets,
+				visited_bmap,
+				distances,
+				predecessors,
+				edge_mask,
+				isolated_bmap,
+				directed);
+		cudaCheckError();
+	}
+
+	//Computes degrees[v] = row_ptr[v + 1] - row_ptr[v] for the vertices v < n and flags the
+	//zero-degree ones: their bits are set in the words written to isolated_bmap and their count
+	//is added to *nisolated with atomicAdd (the counter is not reset here).
+	//Must be launched with 1D blocks of FLAG_ISOLATED_VERTICES_DIMX threads (the CUB block
+	//primitives below are specialized on that size); any 1D grid size works.
+	template<typename IndexType>
+	__global__ void flag_isolated_vertices_kernel(	IndexType n,
+																	int *isolated_bmap,
+																	const IndexType *row_ptr,
+																	IndexType *degrees,
+																	IndexType *nisolated) {
+		typedef cub::BlockLoad<IndexType, FLAG_ISOLATED_VERTICES_DIMX,
+				FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+		typedef cub::BlockStore<IndexType, FLAG_ISOLATED_VERTICES_DIMX,
+				FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+		typedef cub::BlockReduce<IndexType, FLAG_ISOLATED_VERTICES_DIMX> BlockReduce;
+		typedef cub::WarpReduce<int, FLAG_ISOLATED_VERTICES_THREADS_PER_INT> WarpReduce;
+
+		__shared__ typename BlockLoad::TempStorage load_temp_storage;
+		__shared__ typename BlockStore::TempStorage store_temp_storage;
+		__shared__ typename BlockReduce::TempStorage block_reduce_temp_storage;
+		__shared__ typename WarpReduce::TempStorage warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX
+				/ FLAG_ISOLATED_VERTICES_THREADS_PER_INT];
+		//row_ptr_tail[t]: the row_ptr entry that follows the last one loaded by thread t
+		__shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX];
+
+		//Each iteration handles one tile of VERTICES_PER_THREAD * blockDim.x consecutive vertices
+		for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD
+				* (blockDim.x * blockIdx.x);
+				block_off < n;
+				block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) {
+			//First and last vertex assigned to this thread in the tile
+			IndexType thread_off = block_off
+					+ FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x;
+			IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1;
+
+			IndexType thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD];
+			IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1]
+
+			//Guarded load: items at or beyond block_valid_items are set to -1
+			BlockLoad(load_temp_storage).Load(	row_ptr + block_off,
+															thread_row_ptr,
+															block_valid_items,
+															-1);
+
+			//To compute 4 degrees, we need 5 values of row_ptr
+			//Saving the "5th" value in shared memory for previous thread to use
+			if (threadIdx.x > 0) {
+				row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0];
+			}
+			//If this is the last thread, it needs to load its row ptr tail value
+			if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) {
+				row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1];
+			}
+			__syncthreads(); // we may reuse temp_storage
+
+			int local_isolated_bmap = 0;
+			IndexType imax = (n - thread_off); //vertices of this thread that are < n (can be <= 0)
+			IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD];
+
+#pragma unroll
+			for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) {
+				IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i];
+				if (i < imax)
+					local_isolated_bmap |= ((degree == 0) << i);
+			}
+
+			//The last vertex of the thread needs the value saved in row_ptr_tail
+			if (last_node_thread < n) {
+				IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] =
+						row_ptr_tail[threadIdx.x]
+								- thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1];
+				local_isolated_bmap |= ((degree == 0)
+						<< (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1));
+			}
+
+			//Shift the thread's bits to their position inside the bitmap word
+			local_isolated_bmap <<= (thread_off % INT_SIZE);
+			IndexType local_nisolated = __popc(local_isolated_bmap);
+
+			//We need local_nisolated and local_isolated_bmap to be ready for next steps
+			__syncthreads();
+
+			//The block-wide sum is only used by thread 0
+			IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated);
+			if (threadIdx.x == 0 && total_nisolated) {
+				atomicAdd(nisolated, total_nisolated);
+			}
+
+			int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT;
+			//Building int for bmap
+			int int_aggregate_isolated_bmap =
+					WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce(	local_isolated_bmap,
+																									BitwiseOr());
+			//NOTE(review): heads with thread_off >= n also store a (zero) word - confirm bmap size
+			int is_head_of_visited_int =
+					((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0);
+			if (is_head_of_visited_int) {
+				isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap;
+			}
+			//Only n - block_off degrees exist in this tile: block_valid_items also counts the extra
+			//row_ptr entry, and storing that many would write a meaningless value to degrees[n]
+			BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items - 1);
+		}
+	}
+
+	//Host wrapper: enqueues flag_isolated_vertices_kernel on m_stream using blocks of
+	//FLAG_ISOLATED_VERTICES_DIMX threads and at most MAXBLOCKS blocks.
+	template<typename IndexType>
+	void flag_isolated_vertices(IndexType n,
+			int *isolated_bmap,
+			const IndexType *row_ptr,
+			IndexType *degrees,
+			IndexType *nisolated,
+			cudaStream_t m_stream) {
+		dim3 threads(FLAG_ISOLATED_VERTICES_DIMX);
+		dim3 blocks;
+
+		//ceil((n / VERTICES_PER_THREAD + 1) / threads.x), capped at MAXBLOCKS
+		blocks.x = min(	(IndexType) MAXBLOCKS,
+							(n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + threads.x - 1) / threads.x);
+
+		flag_isolated_vertices_kernel<<<blocks, threads, 0, m_stream>>>(n,
+				isolated_bmap, row_ptr,
+				degrees, nisolated);
+		cudaCheckError();
+	}
+
+	//
+	//
+	//
+	// Some utility functions
+	//
+	//
+
+	//Sizes and allocates the CUB temp storage DeviceScan::ExclusiveSum needs for n items.
+	//The buffer returned in d_temp_storage is not freed here, so the caller owns it.
+	template<typename IndexType>
+	void cub_exclusive_sum_alloc(IndexType n, void*& d_temp_storage, size_t &temp_storage_bytes) {
+		d_temp_storage = NULL; //NULL temp pointer: the call below only reports the required size
+		temp_storage_bytes = 0;
+		IndexType *d_in = NULL, *d_out = NULL;
+		cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);
+		cudaMalloc(&d_temp_storage, temp_storage_bytes);
+		cudaCheckError(); //same post-call check as the launch wrappers; the result was ignored before
+	}
+
+	//Sets vec[0..n) to val; valid for any 1D grid (each thread advances by the total thread count).
+	//The loop index is 64-bit so adding the stride cannot wrap when n is near the 32-bit maximum.
+	template<typename IndexType>
+	__global__ void fill_kernel(IndexType *vec, IndexType n, IndexType val) {
+		const long long stride = (long long) gridDim.x * blockDim.x;
+		for (long long u = (long long) blockDim.x * blockIdx.x + threadIdx.x; u < (long long) n; u += stride)
+			vec[u] = val;
+	}
+
+	//Fills vec[0..n) with val by launching fill_kernel on m_stream; nothing is launched when n < 1.
+	template<typename IndexType>
+	void fill(IndexType *vec, IndexType n, IndexType val, cudaStream_t m_stream) {
+		if (n < 1) return; //grid.x would be 0, which is an invalid launch configuration
+		dim3 grid, block(256);
+		grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS);
+		fill_kernel<<<grid, block, 0, m_stream>>>(vec, n, val);
+		cudaCheckError();
+	}
+
+	//Gathers degrees for a frontier: frontier_degree[i] = degree[frontier[i]] for i in [0, n).
+	//The loop index is 64-bit so adding the stride cannot wrap when n is near the 32-bit maximum.
+	template<typename IndexType>
+	__global__ void set_frontier_degree_kernel(	IndexType *frontier_degree,
+																IndexType *frontier,
+																const IndexType *degree,
+																IndexType n) {
+		const long long stride = (long long) gridDim.x * blockDim.x;
+		for (long long idx = (long long) blockDim.x * blockIdx.x + threadIdx.x; idx < (long long) n; idx += stride) {
+			frontier_degree[idx] = degree[frontier[idx]];
+		}
+	}
+
+	//Host wrapper: runs set_frontier_degree_kernel on m_stream so that
+	//frontier_degree[i] = degree[frontier[i]] for i in [0, n). Nothing is launched when n < 1.
+	template<typename IndexType>
+	void set_frontier_degree(	IndexType *frontier_degree,
+										IndexType *frontier,
+										const IndexType *degree,
+										IndexType n,
+										cudaStream_t m_stream) {
+		if (n < 1)
+			return; //grid.x would be 0, which is an invalid launch configuration
+		dim3 grid, block(256);
+		grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS);
+		set_frontier_degree_kernel<<<grid, block, 0, m_stream>>>(frontier_degree, frontier,
+																					degree, n);
+		cudaCheckError();
+	}
+
+	//Exclusive prefix sum of d_in[0..num_items) into d_out on m_stream, using caller-provided
+	//CUB temp storage. num_items == 1 is handled here (d_out[0] = 0) instead of calling CUB.
+	template<typename IndexType>
+	void exclusive_sum(	void *d_temp_storage, size_t temp_storage_bytes,
+								IndexType *d_in, IndexType *d_out,
+								IndexType num_items, cudaStream_t m_stream) {
+		if (num_items < 1)
+			return;
+		if (num_items == 1) { //DeviceScan fails if n==1; the exclusive sum of one item is 0
+			cudaMemsetAsync(d_out, 0, sizeof(IndexType), m_stream);
+		} else {
+			cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out,
+													num_items, m_stream);
+		}
+		cudaCheckError(); //same post-call check as the launch wrappers in this file
+	}
+
+	//Sets vec[0..n) to val for any 1D grid; 64-bit loop index so the stride addition cannot wrap.
+	template<typename T>
+	__global__ void fill_vec_kernel(T *vec, T n, T val) {
+		const long long stride = (long long) blockDim.x * gridDim.x;
+		for (long long idx = (long long) blockIdx.x * blockDim.x + threadIdx.x; idx < (long long) n; idx += stride)
+			vec[idx] = val;
+	}
+
+	//Fills vec[0..n) with val by launching fill_vec_kernel on stream with ceil(n / 256) blocks.
+	//Nothing is launched when n < 1 (grid.x would be 0, an invalid launch configuration).
+	template<typename T>
+	void fill_vec(T *vec, T n, T val, cudaStream_t stream) {
+		if (n < 1) return;
+		dim3 grid, block(256);
+		grid.x = (n + block.x - 1) / block.x;
+		fill_vec_kernel<<<grid, block, 0, stream>>>(vec, n, val);
+		cudaCheckError();
+	}
+}
+//
diff --git a/src/cugraph.cu b/src/cugraph.cu
index f85b524fde8..762aaa6ee5c 100644
--- a/src/cugraph.cu
+++ b/src/cugraph.cu
@@ -17,6 +17,7 @@
 #include "pagerank.cuh"
 #include "COOtoCSR.cuh"
 #include "utilities/error_utils.h"
+#include "bfs.cuh"
 
 //#include <functions.h>
 
@@ -331,11 +332,32 @@ gdf_error gdf_delete_transpose(gdf_graph *graph) {
   return GDF_SUCCESS;
 }
 
-gdf_error gdf_pagerank(gdf_graph *graph, gdf_column *pagerank, float alpha, float tolerance, int max_iter, bool has_guess)
-{ 
+gdf_error gdf_pagerank(gdf_graph *graph, gdf_column *pagerank, float alpha, float tolerance, int max_iter, bool has_guess) {
   switch (pagerank->dtype) {
     case GDF_FLOAT32:   return gdf_pagerank_impl<float>(graph, pagerank, alpha, tolerance, max_iter, has_guess);
     case GDF_FLOAT64:   return gdf_pagerank_impl<double>(graph, pagerank, alpha, tolerance, max_iter, has_guess);
     default: return GDF_UNSUPPORTED_DTYPE;
   }
 }
+
+gdf_error gdf_bfs(gdf_graph *graph, gdf_column *distances, gdf_column *predecessors, int start_node, bool directed) {
+	// BFS from start_node over graph->adjList; requires int32 offsets/indices and int32 output columns.
+	GDF_REQUIRE(graph->adjList != nullptr, GDF_VALIDITY_UNSUPPORTED);
+	GDF_REQUIRE(graph->adjList->offsets->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE);
+	GDF_REQUIRE(graph->adjList->indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE);
+	GDF_REQUIRE(distances->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE);
+	GDF_REQUIRE(predecessors->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE);
+	int n = graph->adjList->offsets->size - 1;
+	int e = graph->adjList->indices->size;
+	// The source vertex must be one of the n vertices; reject it before starting the traversal.
+	GDF_REQUIRE(start_node >= 0 && start_node < n, GDF_INVALID_API_CALL);
+	int* offsets_ptr = (int*)graph->adjList->offsets->data;
+	int* indices_ptr = (int*)graph->adjList->indices->data;
+	int* distances_ptr = (int*)distances->data;
+	int* predecessors_ptr = (int*)predecessors->data;
+	int alpha = 15, beta = 18; // tuning values forwarded to the Bfs constructor
+	cugraph::Bfs<int> bfs(n, e, offsets_ptr, indices_ptr, directed, alpha, beta);
+	bfs.configure(distances_ptr, predecessors_ptr, nullptr);
+	bfs.traverse(start_node);
+	return GDF_SUCCESS;
+}
diff --git a/src/utilities/sm_utils.h b/src/utilities/sm_utils.h
new file mode 100644
index 00000000000..d265108256e
--- /dev/null
+++ b/src/utilities/sm_utils.h
@@ -0,0 +1,280 @@
+#pragma once
+
+#ifdef _MSC_VER
+#include <stdint.h>
+#else
+#include <inttypes.h>
+#endif
+
+#define DEFAULT_MASK 0xffffffff
+
+#define USE_CG 1
+//(__CUDACC_VER__ >= 80500)
+
+
+namespace cugraph
+{
+namespace utils
+{
+    // Warp-level helpers. The vote/shuffle wrappers use the real intrinsic only when
+    // __CUDA_ARCH__ >= 300 (zero is returned otherwise). USE_CG != 0 selects the *_sync forms,
+    // which receive `mask`; otherwise the legacy forms are used. `bound` is the shuffle width.
+    // Reads the PTX %laneid special register.
+    static __device__ __forceinline__ int lane_id()
+    {
+        int id;
+        asm ( "mov.u32 %0, %%laneid;" : "=r"(id) );
+        return id;
+    }
+    // Reads the PTX %lanemask_lt special register.
+    static __device__ __forceinline__ int lane_mask_lt()
+    {
+        int mask;
+        asm ( "mov.u32 %0, %%lanemask_lt;" : "=r"(mask) );
+        return mask;
+    }
+    // Reads the PTX %lanemask_le special register.
+    static __device__ __forceinline__ int lane_mask_le()
+    {
+        int mask;
+        asm ( "mov.u32 %0, %%lanemask_le;" : "=r"(mask) );
+        return mask;
+    }
+    // threadIdx.x / 32, computed with a shift.
+    static __device__ __forceinline__ int warp_id()
+    {
+        return threadIdx.x >> 5;
+    }
+    // Warp vote on predicate p (__ballot_sync, or __ballot when USE_CG is 0).
+    static __device__ __forceinline__ unsigned int ballot(int p, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        return __ballot_sync(mask, p);
+#else
+        return __ballot(p);   
+#endif
+    #else
+        return 0;
+    #endif
+    }
+    // Indexed shuffle: reads r from lane `lane`.
+    static __device__ __forceinline__ int shfl(int r, int lane, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        return __shfl_sync(mask, r, lane, bound );
+#else
+        return __shfl(r, lane, bound );
+#endif
+    #else
+        return 0;
+    #endif
+    }
+    // Indexed shuffle, float overload.
+    static __device__ __forceinline__ float shfl(float r, int lane, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        return __shfl_sync(mask, r, lane, bound );
+#else
+        return __shfl(r, lane, bound );
+#endif
+    #else
+        return 0.0f;
+    #endif
+    }
+
+    /// Indexed warp shuffle for 64-bit floating point values.
+    /** The value is split into two 32-bit halves that are shuffled separately and
+    *  reassembled, following the approach described in
+    *  (http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler).
+    */
+    static __device__ __forceinline__ double shfl(double r, int lane, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_sync(mask, a.x, lane, bound);
+        a.y = __shfl_sync(mask, a.y, lane, bound);
+        return *reinterpret_cast<double*>(&a);
+#else
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl(a.x, lane, bound);
+        a.y = __shfl(a.y, lane, bound);
+        return *reinterpret_cast<double*>(&a);
+#endif
+    #else
+        return 0.0;
+    #endif
+    }
+    // Indexed shuffle, 64-bit integer overload (two 32-bit shuffles).
+    static __device__ __forceinline__ long long shfl(long long r, int lane, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_sync(mask, a.x, lane, bound);
+        a.y = __shfl_sync(mask, a.y, lane, bound);
+        return *reinterpret_cast<long long*>(&a);
+#else
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl(a.x, lane, bound);
+        a.y = __shfl(a.y, lane, bound);
+        return *reinterpret_cast<long long*>(&a);
+#endif
+    #else
+        return 0;
+    #endif
+    }
+    // Shuffle down: reads r from the lane `offset` positions higher.
+    static __device__ __forceinline__ int shfl_down(int r, int offset, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        return __shfl_down_sync( mask, r, offset, bound );
+#else
+        return __shfl_down( r, offset, bound );
+#endif
+    #else
+        return 0;
+    #endif
+    }
+    // Shuffle down, float overload.
+    static __device__ __forceinline__ float shfl_down(float r, int offset, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        return __shfl_down_sync( mask, r, offset, bound );
+#else
+        return __shfl_down( r, offset, bound );
+#endif
+    #else
+        return 0.0f;
+    #endif
+    }
+    // Shuffle down, double overload (two 32-bit shuffles).
+    static __device__ __forceinline__ double shfl_down(double r, int offset, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_down_sync(mask, a.x, offset, bound);
+        a.y = __shfl_down_sync(mask, a.y, offset, bound);
+        return *reinterpret_cast<double*>(&a);
+#else
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_down(a.x, offset, bound);
+        a.y = __shfl_down(a.y, offset, bound);
+        return *reinterpret_cast<double*>(&a);
+#endif
+    #else
+        return 0.0;
+    #endif
+    }
+    // Shuffle down, 64-bit integer overload (two 32-bit shuffles).
+    static __device__ __forceinline__ long long shfl_down(long long r, int offset, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_down_sync(mask, a.x, offset, bound);
+        a.y = __shfl_down_sync(mask, a.y, offset, bound);
+        return *reinterpret_cast<long long*>(&a);
+#else
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_down(a.x, offset, bound);
+        a.y = __shfl_down(a.y, offset, bound);
+        return *reinterpret_cast<long long*>(&a);
+#endif
+    #else
+        return 0;
+    #endif
+    }
+    // Shuffle down, unsigned 64-bit overload (two 32-bit shuffles);
+    // specifically for triangles counting
+    static __device__ __forceinline__ uint64_t shfl_down(uint64_t r, int offset, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_down_sync(mask, a.x, offset, bound);
+        a.y = __shfl_down_sync(mask, a.y, offset, bound);
+        return *reinterpret_cast<uint64_t*>(&a);
+#else
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_down(a.x, offset, bound);
+        a.y = __shfl_down(a.y, offset, bound);
+        return *reinterpret_cast<uint64_t*>(&a);
+#endif
+    #else
+        return 0;
+    #endif
+    }
+    // Shuffle up: reads r from the lane `offset` positions lower.
+    static __device__ __forceinline__ int shfl_up(int r, int offset, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        return __shfl_up_sync( mask, r, offset, bound );
+#else
+        return __shfl_up( r, offset, bound );
+#endif
+    #else
+        return 0;
+    #endif
+    }
+    // Shuffle up, float overload.
+    static __device__ __forceinline__ float shfl_up(float r, int offset, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        return __shfl_up_sync( mask, r, offset, bound );
+#else
+        return __shfl_up( r, offset, bound );
+#endif
+    #else
+        return 0.0f;
+    #endif
+    }
+    // Shuffle up, double overload (two 32-bit shuffles).
+    static __device__ __forceinline__ double shfl_up(double r, int offset, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_up_sync(mask, a.x, offset, bound);
+        a.y = __shfl_up_sync(mask, a.y, offset, bound);
+        return *reinterpret_cast<double*>(&a);
+#else
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_up(a.x, offset, bound);
+        a.y = __shfl_up(a.y, offset, bound);
+        return *reinterpret_cast<double*>(&a);
+#endif
+    #else
+        return 0.0;
+    #endif
+    }
+    // Shuffle up, 64-bit integer overload (two 32-bit shuffles).
+    static __device__ __forceinline__ long long shfl_up(long long r, int offset, int bound = 32, int mask = DEFAULT_MASK)
+    {
+    #if __CUDA_ARCH__ >= 300
+#if USE_CG
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_up_sync(mask, a.x, offset, bound);
+        a.y = __shfl_up_sync(mask, a.y, offset, bound);
+        return *reinterpret_cast<long long*>(&a);
+#else
+        int2 a = *reinterpret_cast<int2*>(&r);
+        a.x = __shfl_up(a.x, offset, bound);
+        a.y = __shfl_up(a.y, offset, bound);
+        return *reinterpret_cast<long long*>(&a);
+#endif
+    #else
+        return 0;
+    #endif
+    }
+}
+
+}

From 0a11c9f15176898ece55aadfd045d383bb9be83f Mon Sep 17 00:00:00 2001
From: James Wyles <jwyles@nvidia.com>
Date: Wed, 30 Jan 2019 16:50:11 -0700
Subject: [PATCH 2/6] Cleanup after merge with master

---
 CMakeLists.txt             |    4 -
 python/bfs/bfs_wrapper.cpp | 6885 ------------------------------------
 python/bfs/bfs_wrapper.pyx |  142 -
 python/cugraph.pyx         |    1 +
 4 files changed, 1 insertion(+), 7031 deletions(-)
 delete mode 100644 python/bfs/bfs_wrapper.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1c410cbb8e8..740820d19c6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -183,7 +183,6 @@ include_directories(
     "${CMAKE_CURRENT_SOURCE_DIR}/include" 
     "${CMAKE_CURRENT_SOURCE_DIR}/src"
     "${CMAKE_CURRENT_SOURCE_DIR}/external/cub"
-    "${CUDA_INCLUDE_DIRS}" 
     "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" 
     "${CUDF_INCLUDE}"
     "${NVGRAPH_INCLUDE}"
@@ -206,11 +205,8 @@ add_library(cugraph SHARED
     src/grmat.cu
     src/cugraph.cu
     src/pagerank.cu
-<<<<<<< HEAD
     src/bfs.cu
-=======
     src/nvgraph_gdf.cu
->>>>>>> master
     ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/test_utils.cu
     ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/error_utils.cu
     ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/misc_utils.cu
diff --git a/python/bfs/bfs_wrapper.cpp b/python/bfs/bfs_wrapper.cpp
deleted file mode 100644
index 76e6447d377..00000000000
--- a/python/bfs/bfs_wrapper.cpp
+++ /dev/null
@@ -1,6885 +0,0 @@
-/* Generated by Cython 0.28.5 */
-
-/* BEGIN: Cython Metadata
-{
-    "distutils": {
-        "depends": [],
-        "extra_compile_args": [
-            "-std=c++11"
-        ],
-        "include_dirs": [
-            "/home/jwyles/anaconda3/envs/cugraph_dev/lib/python3.5/site-packages/numpy/core/include",
-            "/home/jwyles/anaconda3/envs/cugraph_dev/include",
-            "src",
-            "include",
-            "../gunrock",
-            "../gunrock/externals/moderngpu/include",
-            "../gunrock/externals/cub"
-        ],
-        "language": "c++",
-        "libraries": [
-            "cugraph",
-            "cudf"
-        ],
-        "library_dirs": [
-            "/home/jwyles/anaconda3/envs/cugraph_dev/lib/python3.5/site-packages"
-        ],
-        "name": "cugraph",
-        "sources": [
-            "python/pagerank/pagerank_wrapper.pyx",
-            "python/bfs/bfs_wrapper.pyx"
-        ]
-    },
-    "module_name": "cugraph"
-}
-END: Cython Metadata */
-
-#define PY_SSIZE_T_CLEAN
-#include "Python.h"
-#ifndef Py_PYTHON_H
-    #error Python headers needed to compile C extensions, please install development version of Python.
-#elif PY_VERSION_HEX < 0x02060000 || (0x03000000 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x03030000)
-    #error Cython requires Python 2.6+ or Python 3.3+.
-#else
-#define CYTHON_ABI "0_28_5"
-#define CYTHON_FUTURE_DIVISION 0
-#include <stddef.h>
-#ifndef offsetof
-  #define offsetof(type, member) ( (size_t) & ((type*)0) -> member )
-#endif
-#if !defined(WIN32) && !defined(MS_WINDOWS)
-  #ifndef __stdcall
-    #define __stdcall
-  #endif
-  #ifndef __cdecl
-    #define __cdecl
-  #endif
-  #ifndef __fastcall
-    #define __fastcall
-  #endif
-#endif
-#ifndef DL_IMPORT
-  #define DL_IMPORT(t) t
-#endif
-#ifndef DL_EXPORT
-  #define DL_EXPORT(t) t
-#endif
-#define __PYX_COMMA ,
-#ifndef HAVE_LONG_LONG
-  #if PY_VERSION_HEX >= 0x02070000
-    #define HAVE_LONG_LONG
-  #endif
-#endif
-#ifndef PY_LONG_LONG
-  #define PY_LONG_LONG LONG_LONG
-#endif
-#ifndef Py_HUGE_VAL
-  #define Py_HUGE_VAL HUGE_VAL
-#endif
-#ifdef PYPY_VERSION
-  #define CYTHON_COMPILING_IN_PYPY 1
-  #define CYTHON_COMPILING_IN_PYSTON 0
-  #define CYTHON_COMPILING_IN_CPYTHON 0
-  #undef CYTHON_USE_TYPE_SLOTS
-  #define CYTHON_USE_TYPE_SLOTS 0
-  #undef CYTHON_USE_PYTYPE_LOOKUP
-  #define CYTHON_USE_PYTYPE_LOOKUP 0
-  #if PY_VERSION_HEX < 0x03050000
-    #undef CYTHON_USE_ASYNC_SLOTS
-    #define CYTHON_USE_ASYNC_SLOTS 0
-  #elif !defined(CYTHON_USE_ASYNC_SLOTS)
-    #define CYTHON_USE_ASYNC_SLOTS 1
-  #endif
-  #undef CYTHON_USE_PYLIST_INTERNALS
-  #define CYTHON_USE_PYLIST_INTERNALS 0
-  #undef CYTHON_USE_UNICODE_INTERNALS
-  #define CYTHON_USE_UNICODE_INTERNALS 0
-  #undef CYTHON_USE_UNICODE_WRITER
-  #define CYTHON_USE_UNICODE_WRITER 0
-  #undef CYTHON_USE_PYLONG_INTERNALS
-  #define CYTHON_USE_PYLONG_INTERNALS 0
-  #undef CYTHON_AVOID_BORROWED_REFS
-  #define CYTHON_AVOID_BORROWED_REFS 1
-  #undef CYTHON_ASSUME_SAFE_MACROS
-  #define CYTHON_ASSUME_SAFE_MACROS 0
-  #undef CYTHON_UNPACK_METHODS
-  #define CYTHON_UNPACK_METHODS 0
-  #undef CYTHON_FAST_THREAD_STATE
-  #define CYTHON_FAST_THREAD_STATE 0
-  #undef CYTHON_FAST_PYCALL
-  #define CYTHON_FAST_PYCALL 0
-  #undef CYTHON_PEP489_MULTI_PHASE_INIT
-  #define CYTHON_PEP489_MULTI_PHASE_INIT 0
-  #undef CYTHON_USE_TP_FINALIZE
-  #define CYTHON_USE_TP_FINALIZE 0
-#elif defined(PYSTON_VERSION)
-  #define CYTHON_COMPILING_IN_PYPY 0
-  #define CYTHON_COMPILING_IN_PYSTON 1
-  #define CYTHON_COMPILING_IN_CPYTHON 0
-  #ifndef CYTHON_USE_TYPE_SLOTS
-    #define CYTHON_USE_TYPE_SLOTS 1
-  #endif
-  #undef CYTHON_USE_PYTYPE_LOOKUP
-  #define CYTHON_USE_PYTYPE_LOOKUP 0
-  #undef CYTHON_USE_ASYNC_SLOTS
-  #define CYTHON_USE_ASYNC_SLOTS 0
-  #undef CYTHON_USE_PYLIST_INTERNALS
-  #define CYTHON_USE_PYLIST_INTERNALS 0
-  #ifndef CYTHON_USE_UNICODE_INTERNALS
-    #define CYTHON_USE_UNICODE_INTERNALS 1
-  #endif
-  #undef CYTHON_USE_UNICODE_WRITER
-  #define CYTHON_USE_UNICODE_WRITER 0
-  #undef CYTHON_USE_PYLONG_INTERNALS
-  #define CYTHON_USE_PYLONG_INTERNALS 0
-  #ifndef CYTHON_AVOID_BORROWED_REFS
-    #define CYTHON_AVOID_BORROWED_REFS 0
-  #endif
-  #ifndef CYTHON_ASSUME_SAFE_MACROS
-    #define CYTHON_ASSUME_SAFE_MACROS 1
-  #endif
-  #ifndef CYTHON_UNPACK_METHODS
-    #define CYTHON_UNPACK_METHODS 1
-  #endif
-  #undef CYTHON_FAST_THREAD_STATE
-  #define CYTHON_FAST_THREAD_STATE 0
-  #undef CYTHON_FAST_PYCALL
-  #define CYTHON_FAST_PYCALL 0
-  #undef CYTHON_PEP489_MULTI_PHASE_INIT
-  #define CYTHON_PEP489_MULTI_PHASE_INIT 0
-  #undef CYTHON_USE_TP_FINALIZE
-  #define CYTHON_USE_TP_FINALIZE 0
-#else
-  #define CYTHON_COMPILING_IN_PYPY 0
-  #define CYTHON_COMPILING_IN_PYSTON 0
-  #define CYTHON_COMPILING_IN_CPYTHON 1
-  #ifndef CYTHON_USE_TYPE_SLOTS
-    #define CYTHON_USE_TYPE_SLOTS 1
-  #endif
-  #if PY_VERSION_HEX < 0x02070000
-    #undef CYTHON_USE_PYTYPE_LOOKUP
-    #define CYTHON_USE_PYTYPE_LOOKUP 0
-  #elif !defined(CYTHON_USE_PYTYPE_LOOKUP)
-    #define CYTHON_USE_PYTYPE_LOOKUP 1
-  #endif
-  #if PY_MAJOR_VERSION < 3
-    #undef CYTHON_USE_ASYNC_SLOTS
-    #define CYTHON_USE_ASYNC_SLOTS 0
-  #elif !defined(CYTHON_USE_ASYNC_SLOTS)
-    #define CYTHON_USE_ASYNC_SLOTS 1
-  #endif
-  #if PY_VERSION_HEX < 0x02070000
-    #undef CYTHON_USE_PYLONG_INTERNALS
-    #define CYTHON_USE_PYLONG_INTERNALS 0
-  #elif !defined(CYTHON_USE_PYLONG_INTERNALS)
-    #define CYTHON_USE_PYLONG_INTERNALS 1
-  #endif
-  #ifndef CYTHON_USE_PYLIST_INTERNALS
-    #define CYTHON_USE_PYLIST_INTERNALS 1
-  #endif
-  #ifndef CYTHON_USE_UNICODE_INTERNALS
-    #define CYTHON_USE_UNICODE_INTERNALS 1
-  #endif
-  #if PY_VERSION_HEX < 0x030300F0
-    #undef CYTHON_USE_UNICODE_WRITER
-    #define CYTHON_USE_UNICODE_WRITER 0
-  #elif !defined(CYTHON_USE_UNICODE_WRITER)
-    #define CYTHON_USE_UNICODE_WRITER 1
-  #endif
-  #ifndef CYTHON_AVOID_BORROWED_REFS
-    #define CYTHON_AVOID_BORROWED_REFS 0
-  #endif
-  #ifndef CYTHON_ASSUME_SAFE_MACROS
-    #define CYTHON_ASSUME_SAFE_MACROS 1
-  #endif
-  #ifndef CYTHON_UNPACK_METHODS
-    #define CYTHON_UNPACK_METHODS 1
-  #endif
-  #ifndef CYTHON_FAST_THREAD_STATE
-    #define CYTHON_FAST_THREAD_STATE 1
-  #endif
-  #ifndef CYTHON_FAST_PYCALL
-    #define CYTHON_FAST_PYCALL 1
-  #endif
-  #ifndef CYTHON_PEP489_MULTI_PHASE_INIT
-    #define CYTHON_PEP489_MULTI_PHASE_INIT (0 && PY_VERSION_HEX >= 0x03050000)
-  #endif
-  #ifndef CYTHON_USE_TP_FINALIZE
-    #define CYTHON_USE_TP_FINALIZE (PY_VERSION_HEX >= 0x030400a1)
-  #endif
-#endif
-#if !defined(CYTHON_FAST_PYCCALL)
-#define CYTHON_FAST_PYCCALL  (CYTHON_FAST_PYCALL && PY_VERSION_HEX >= 0x030600B1)
-#endif
-#if CYTHON_USE_PYLONG_INTERNALS
-  #include "longintrepr.h"
-  #undef SHIFT
-  #undef BASE
-  #undef MASK
-#endif
-#ifndef __has_attribute
-  #define __has_attribute(x) 0
-#endif
-#ifndef __has_cpp_attribute
-  #define __has_cpp_attribute(x) 0
-#endif
-#ifndef CYTHON_RESTRICT
-  #if defined(__GNUC__)
-    #define CYTHON_RESTRICT __restrict__
-  #elif defined(_MSC_VER) && _MSC_VER >= 1400
-    #define CYTHON_RESTRICT __restrict
-  #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
-    #define CYTHON_RESTRICT restrict
-  #else
-    #define CYTHON_RESTRICT
-  #endif
-#endif
-#ifndef CYTHON_UNUSED
-# if defined(__GNUC__)
-#   if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
-#     define CYTHON_UNUSED __attribute__ ((__unused__))
-#   else
-#     define CYTHON_UNUSED
-#   endif
-# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER))
-#   define CYTHON_UNUSED __attribute__ ((__unused__))
-# else
-#   define CYTHON_UNUSED
-# endif
-#endif
-#ifndef CYTHON_MAYBE_UNUSED_VAR
-#  if defined(__cplusplus)
-     template<class T> void CYTHON_MAYBE_UNUSED_VAR( const T& ) { }
-#  else
-#    define CYTHON_MAYBE_UNUSED_VAR(x) (void)(x)
-#  endif
-#endif
-#ifndef CYTHON_NCP_UNUSED
-# if CYTHON_COMPILING_IN_CPYTHON
-#  define CYTHON_NCP_UNUSED
-# else
-#  define CYTHON_NCP_UNUSED CYTHON_UNUSED
-# endif
-#endif
-#define __Pyx_void_to_None(void_result) ((void)(void_result), Py_INCREF(Py_None), Py_None)
-#ifdef _MSC_VER
-    #ifndef _MSC_STDINT_H_
-        #if _MSC_VER < 1300
-           typedef unsigned char     uint8_t;
-           typedef unsigned int      uint32_t;
-        #else
-           typedef unsigned __int8   uint8_t;
-           typedef unsigned __int32  uint32_t;
-        #endif
-    #endif
-#else
-   #include <stdint.h>
-#endif
-#ifndef CYTHON_FALLTHROUGH
-  #if defined(__cplusplus) && __cplusplus >= 201103L
-    #if __has_cpp_attribute(fallthrough)
-      #define CYTHON_FALLTHROUGH [[fallthrough]]
-    #elif __has_cpp_attribute(clang::fallthrough)
-      #define CYTHON_FALLTHROUGH [[clang::fallthrough]]
-    #elif __has_cpp_attribute(gnu::fallthrough)
-      #define CYTHON_FALLTHROUGH [[gnu::fallthrough]]
-    #endif
-  #endif
-  #ifndef CYTHON_FALLTHROUGH
-    #if __has_attribute(fallthrough)
-      #define CYTHON_FALLTHROUGH __attribute__((fallthrough))
-    #else
-      #define CYTHON_FALLTHROUGH
-    #endif
-  #endif
-  #if defined(__clang__ ) && defined(__apple_build_version__)
-    #if __apple_build_version__ < 7000000
-      #undef  CYTHON_FALLTHROUGH
-      #define CYTHON_FALLTHROUGH
-    #endif
-  #endif
-#endif
-
-#ifndef __cplusplus
-  #error "Cython files generated with the C++ option must be compiled with a C++ compiler."
-#endif
-#ifndef CYTHON_INLINE
-  #if defined(__clang__)
-    #define CYTHON_INLINE __inline__ __attribute__ ((__unused__))
-  #else
-    #define CYTHON_INLINE inline
-  #endif
-#endif
-template<typename T>
-void __Pyx_call_destructor(T& x) {
-    x.~T();
-}
-template<typename T>
-class __Pyx_FakeReference {
-  public:
-    __Pyx_FakeReference() : ptr(NULL) { }
-    __Pyx_FakeReference(const T& ref) : ptr(const_cast<T*>(&ref)) { }
-    T *operator->() { return ptr; }
-    T *operator&() { return ptr; }
-    operator T&() { return *ptr; }
-    template<typename U> bool operator ==(U other) { return *ptr == other; }
-    template<typename U> bool operator !=(U other) { return *ptr != other; }
-  private:
-    T *ptr;
-};
-
-#if CYTHON_COMPILING_IN_PYPY && PY_VERSION_HEX < 0x02070600 && !defined(Py_OptimizeFlag)
-  #define Py_OptimizeFlag 0
-#endif
-#define __PYX_BUILD_PY_SSIZE_T "n"
-#define CYTHON_FORMAT_SSIZE_T "z"
-#if PY_MAJOR_VERSION < 3
-  #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"
-  #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
-          PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
-  #define __Pyx_DefaultClassType PyClass_Type
-#else
-  #define __Pyx_BUILTIN_MODULE_NAME "builtins"
-  #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)\
-          PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
-  #define __Pyx_DefaultClassType PyType_Type
-#endif
-#ifndef Py_TPFLAGS_CHECKTYPES
-  #define Py_TPFLAGS_CHECKTYPES 0
-#endif
-#ifndef Py_TPFLAGS_HAVE_INDEX
-  #define Py_TPFLAGS_HAVE_INDEX 0
-#endif
-#ifndef Py_TPFLAGS_HAVE_NEWBUFFER
-  #define Py_TPFLAGS_HAVE_NEWBUFFER 0
-#endif
-#ifndef Py_TPFLAGS_HAVE_FINALIZE
-  #define Py_TPFLAGS_HAVE_FINALIZE 0
-#endif
-#if PY_VERSION_HEX <= 0x030700A3 || !defined(METH_FASTCALL)
-  #ifndef METH_FASTCALL
-     #define METH_FASTCALL 0x80
-  #endif
-  typedef PyObject *(*__Pyx_PyCFunctionFast) (PyObject *self, PyObject *const *args, Py_ssize_t nargs);
-  typedef PyObject *(*__Pyx_PyCFunctionFastWithKeywords) (PyObject *self, PyObject *const *args,
-                                                          Py_ssize_t nargs, PyObject *kwnames);
-#else
-  #define __Pyx_PyCFunctionFast _PyCFunctionFast
-  #define __Pyx_PyCFunctionFastWithKeywords _PyCFunctionFastWithKeywords
-#endif
-#if CYTHON_FAST_PYCCALL
-#define __Pyx_PyFastCFunction_Check(func)\
-    ((PyCFunction_Check(func) && (METH_FASTCALL == (PyCFunction_GET_FLAGS(func) & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS)))))
-#else
-#define __Pyx_PyFastCFunction_Check(func) 0
-#endif
-#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Malloc)
-  #define PyObject_Malloc(s)   PyMem_Malloc(s)
-  #define PyObject_Free(p)     PyMem_Free(p)
-  #define PyObject_Realloc(p)  PyMem_Realloc(p)
-#endif
-#if CYTHON_COMPILING_IN_PYSTON
-  #define __Pyx_PyCode_HasFreeVars(co)  PyCode_HasFreeVars(co)
-  #define __Pyx_PyFrame_SetLineNumber(frame, lineno) PyFrame_SetLineNumber(frame, lineno)
-#else
-  #define __Pyx_PyCode_HasFreeVars(co)  (PyCode_GetNumFree(co) > 0)
-  #define __Pyx_PyFrame_SetLineNumber(frame, lineno)  (frame)->f_lineno = (lineno)
-#endif
-#if !CYTHON_FAST_THREAD_STATE || PY_VERSION_HEX < 0x02070000
-  #define __Pyx_PyThreadState_Current PyThreadState_GET()
-#elif PY_VERSION_HEX >= 0x03060000
-  #define __Pyx_PyThreadState_Current _PyThreadState_UncheckedGet()
-#elif PY_VERSION_HEX >= 0x03000000
-  #define __Pyx_PyThreadState_Current PyThreadState_GET()
-#else
-  #define __Pyx_PyThreadState_Current _PyThreadState_Current
-#endif
-#if PY_VERSION_HEX < 0x030700A2 && !defined(PyThread_tss_create) && !defined(Py_tss_NEEDS_INIT)
-#include "pythread.h"
-#define Py_tss_NEEDS_INIT 0
-typedef int Py_tss_t;
-static CYTHON_INLINE int PyThread_tss_create(Py_tss_t *key) {
-  *key = PyThread_create_key();
-  return 0; // PyThread_create_key reports success always
-}
-static CYTHON_INLINE Py_tss_t * PyThread_tss_alloc(void) {
-  Py_tss_t *key = (Py_tss_t *)PyObject_Malloc(sizeof(Py_tss_t));
-  *key = Py_tss_NEEDS_INIT;
-  return key;
-}
-static CYTHON_INLINE void PyThread_tss_free(Py_tss_t *key) {
-  PyObject_Free(key);
-}
-static CYTHON_INLINE int PyThread_tss_is_created(Py_tss_t *key) {
-  return *key != Py_tss_NEEDS_INIT;
-}
-static CYTHON_INLINE void PyThread_tss_delete(Py_tss_t *key) {
-  PyThread_delete_key(*key);
-  *key = Py_tss_NEEDS_INIT;
-}
-static CYTHON_INLINE int PyThread_tss_set(Py_tss_t *key, void *value) {
-  return PyThread_set_key_value(*key, value);
-}
-static CYTHON_INLINE void * PyThread_tss_get(Py_tss_t *key) {
-  return PyThread_get_key_value(*key);
-}
-#endif // TSS (Thread Specific Storage) API
-#if CYTHON_COMPILING_IN_CPYTHON || defined(_PyDict_NewPresized)
-#define __Pyx_PyDict_NewPresized(n)  ((n <= 8) ? PyDict_New() : _PyDict_NewPresized(n))
-#else
-#define __Pyx_PyDict_NewPresized(n)  PyDict_New()
-#endif
-#if PY_MAJOR_VERSION >= 3 || CYTHON_FUTURE_DIVISION
-  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_TrueDivide(x,y)
-  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceTrueDivide(x,y)
-#else
-  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_Divide(x,y)
-  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceDivide(x,y)
-#endif
-#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1 && CYTHON_USE_UNICODE_INTERNALS
-#define __Pyx_PyDict_GetItemStr(dict, name)  _PyDict_GetItem_KnownHash(dict, name, ((PyASCIIObject *) name)->hash)
-#else
-#define __Pyx_PyDict_GetItemStr(dict, name)  PyDict_GetItem(dict, name)
-#endif
-#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND)
-  #define CYTHON_PEP393_ENABLED 1
-  #define __Pyx_PyUnicode_READY(op)       (likely(PyUnicode_IS_READY(op)) ?\
-                                              0 : _PyUnicode_Ready((PyObject *)(op)))
-  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_LENGTH(u)
-  #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i)
-  #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u)   PyUnicode_MAX_CHAR_VALUE(u)
-  #define __Pyx_PyUnicode_KIND(u)         PyUnicode_KIND(u)
-  #define __Pyx_PyUnicode_DATA(u)         PyUnicode_DATA(u)
-  #define __Pyx_PyUnicode_READ(k, d, i)   PyUnicode_READ(k, d, i)
-  #define __Pyx_PyUnicode_WRITE(k, d, i, ch)  PyUnicode_WRITE(k, d, i, ch)
-  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
-#else
-  #define CYTHON_PEP393_ENABLED 0
-  #define PyUnicode_1BYTE_KIND  1
-  #define PyUnicode_2BYTE_KIND  2
-  #define PyUnicode_4BYTE_KIND  4
-  #define __Pyx_PyUnicode_READY(op)       (0)
-  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_SIZE(u)
-  #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i]))
-  #define __Pyx_PyUnicode_MAX_CHAR_VALUE(u)   ((sizeof(Py_UNICODE) == 2) ? 65535 : 1114111)
-  #define __Pyx_PyUnicode_KIND(u)         (sizeof(Py_UNICODE))
-  #define __Pyx_PyUnicode_DATA(u)         ((void*)PyUnicode_AS_UNICODE(u))
-  #define __Pyx_PyUnicode_READ(k, d, i)   ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i]))
-  #define __Pyx_PyUnicode_WRITE(k, d, i, ch)  (((void)(k)), ((Py_UNICODE*)d)[i] = ch)
-  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != PyUnicode_GET_SIZE(u))
-#endif
-#if CYTHON_COMPILING_IN_PYPY
-  #define __Pyx_PyUnicode_Concat(a, b)      PyNumber_Add(a, b)
-  #define __Pyx_PyUnicode_ConcatSafe(a, b)  PyNumber_Add(a, b)
-#else
-  #define __Pyx_PyUnicode_Concat(a, b)      PyUnicode_Concat(a, b)
-  #define __Pyx_PyUnicode_ConcatSafe(a, b)  ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ?\
-      PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b))
-#endif
-#if CYTHON_COMPILING_IN_PYPY && !defined(PyUnicode_Contains)
-  #define PyUnicode_Contains(u, s)  PySequence_Contains(u, s)
-#endif
-#if CYTHON_COMPILING_IN_PYPY && !defined(PyByteArray_Check)
-  #define PyByteArray_Check(obj)  PyObject_TypeCheck(obj, &PyByteArray_Type)
-#endif
-#if CYTHON_COMPILING_IN_PYPY && !defined(PyObject_Format)
-  #define PyObject_Format(obj, fmt)  PyObject_CallMethod(obj, "__format__", "O", fmt)
-#endif
-#define __Pyx_PyString_FormatSafe(a, b)   ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b))
-#define __Pyx_PyUnicode_FormatSafe(a, b)  ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b))
-#if PY_MAJOR_VERSION >= 3
-  #define __Pyx_PyString_Format(a, b)  PyUnicode_Format(a, b)
-#else
-  #define __Pyx_PyString_Format(a, b)  PyString_Format(a, b)
-#endif
-#if PY_MAJOR_VERSION < 3 && !defined(PyObject_ASCII)
-  #define PyObject_ASCII(o)            PyObject_Repr(o)
-#endif
-#if PY_MAJOR_VERSION >= 3
-  #define PyBaseString_Type            PyUnicode_Type
-  #define PyStringObject               PyUnicodeObject
-  #define PyString_Type                PyUnicode_Type
-  #define PyString_Check               PyUnicode_Check
-  #define PyString_CheckExact          PyUnicode_CheckExact
-  #define PyObject_Unicode             PyObject_Str
-#endif
-#if PY_MAJOR_VERSION >= 3
-  #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj)
-  #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj)
-#else
-  #define __Pyx_PyBaseString_Check(obj) (PyString_Check(obj) || PyUnicode_Check(obj))
-  #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj))
-#endif
-#ifndef PySet_CheckExact
-  #define PySet_CheckExact(obj)        (Py_TYPE(obj) == &PySet_Type)
-#endif
-#if CYTHON_ASSUME_SAFE_MACROS
-  #define __Pyx_PySequence_SIZE(seq)  Py_SIZE(seq)
-#else
-  #define __Pyx_PySequence_SIZE(seq)  PySequence_Size(seq)
-#endif
-#if PY_MAJOR_VERSION >= 3
-  #define PyIntObject                  PyLongObject
-  #define PyInt_Type                   PyLong_Type
-  #define PyInt_Check(op)              PyLong_Check(op)
-  #define PyInt_CheckExact(op)         PyLong_CheckExact(op)
-  #define PyInt_FromString             PyLong_FromString
-  #define PyInt_FromUnicode            PyLong_FromUnicode
-  #define PyInt_FromLong               PyLong_FromLong
-  #define PyInt_FromSize_t             PyLong_FromSize_t
-  #define PyInt_FromSsize_t            PyLong_FromSsize_t
-  #define PyInt_AsLong                 PyLong_AsLong
-  #define PyInt_AS_LONG                PyLong_AS_LONG
-  #define PyInt_AsSsize_t              PyLong_AsSsize_t
-  #define PyInt_AsUnsignedLongMask     PyLong_AsUnsignedLongMask
-  #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask
-  #define PyNumber_Int                 PyNumber_Long
-#endif
-#if PY_MAJOR_VERSION >= 3
-  #define PyBoolObject                 PyLongObject
-#endif
-#if PY_MAJOR_VERSION >= 3 && CYTHON_COMPILING_IN_PYPY
-  #ifndef PyUnicode_InternFromString
-    #define PyUnicode_InternFromString(s) PyUnicode_FromString(s)
-  #endif
-#endif
-#if PY_VERSION_HEX < 0x030200A4
-  typedef long Py_hash_t;
-  #define __Pyx_PyInt_FromHash_t PyInt_FromLong
-  #define __Pyx_PyInt_AsHash_t   PyInt_AsLong
-#else
-  #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t
-  #define __Pyx_PyInt_AsHash_t   PyInt_AsSsize_t
-#endif
-#if PY_MAJOR_VERSION >= 3
-  #define __Pyx_PyMethod_New(func, self, klass) ((self) ? PyMethod_New(func, self) : (Py_INCREF(func), func))
-#else
-  #define __Pyx_PyMethod_New(func, self, klass) PyMethod_New(func, self, klass)
-#endif
-#if CYTHON_USE_ASYNC_SLOTS
-  #if PY_VERSION_HEX >= 0x030500B1
-    #define __Pyx_PyAsyncMethodsStruct PyAsyncMethods
-    #define __Pyx_PyType_AsAsync(obj) (Py_TYPE(obj)->tp_as_async)
-  #else
-    #define __Pyx_PyType_AsAsync(obj) ((__Pyx_PyAsyncMethodsStruct*) (Py_TYPE(obj)->tp_reserved))
-  #endif
-#else
-  #define __Pyx_PyType_AsAsync(obj) NULL
-#endif
-#ifndef __Pyx_PyAsyncMethodsStruct
-    typedef struct {
-        unaryfunc am_await;
-        unaryfunc am_aiter;
-        unaryfunc am_anext;
-    } __Pyx_PyAsyncMethodsStruct;
-#endif
-
-#if defined(WIN32) || defined(MS_WINDOWS)
-  #define _USE_MATH_DEFINES
-#endif
-#include <math.h>
-#ifdef NAN
-#define __PYX_NAN() ((float) NAN)
-#else
-static CYTHON_INLINE float __PYX_NAN() {
-  float value;
-  memset(&value, 0xFF, sizeof(value));
-  return value;
-}
-#endif
-#if defined(__CYGWIN__) && defined(_LDBL_EQ_DBL)
-#define __Pyx_truncl trunc
-#else
-#define __Pyx_truncl truncl
-#endif
-
-
-#define __PYX_ERR(f_index, lineno, Ln_error) \
-{ \
-  __pyx_filename = __pyx_f[f_index]; __pyx_lineno = lineno; __pyx_clineno = __LINE__; goto Ln_error; \
-}
-
-#ifndef __PYX_EXTERN_C
-  #ifdef __cplusplus
-    #define __PYX_EXTERN_C extern "C"
-  #else
-    #define __PYX_EXTERN_C extern
-  #endif
-#endif
-
-#define __PYX_HAVE__bfs_wrapper
-#define __PYX_HAVE_API__bfs_wrapper
-/* Early includes */
-#include "cudf.h"
-#include "cugraph.h"
-#include <stdint.h>
-#include <string.h>
-#include <stdlib.h>
-#ifdef _OPENMP
-#include <omp.h>
-#endif /* _OPENMP */
-
-#if defined(PYREX_WITHOUT_ASSERTIONS) && !defined(CYTHON_WITHOUT_ASSERTIONS)
-#define CYTHON_WITHOUT_ASSERTIONS
-#endif
-
-typedef struct {PyObject **p; const char *s; const Py_ssize_t n; const char* encoding;
-                const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry;
-
-#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0
-#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT 0
-#define __PYX_DEFAULT_STRING_ENCODING ""
-#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString
-#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
-#define __Pyx_uchar_cast(c) ((unsigned char)c)
-#define __Pyx_long_cast(x) ((long)x)
-#define __Pyx_fits_Py_ssize_t(v, type, is_signed)  (\
-    (sizeof(type) < sizeof(Py_ssize_t))  ||\
-    (sizeof(type) > sizeof(Py_ssize_t) &&\
-          likely(v < (type)PY_SSIZE_T_MAX ||\
-                 v == (type)PY_SSIZE_T_MAX)  &&\
-          (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||\
-                                v == (type)PY_SSIZE_T_MIN)))  ||\
-    (sizeof(type) == sizeof(Py_ssize_t) &&\
-          (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||\
-                               v == (type)PY_SSIZE_T_MAX)))  )
-#if defined (__cplusplus) && __cplusplus >= 201103L
-    #include <cstdlib>
-    #define __Pyx_sst_abs(value) std::abs(value)
-#elif SIZEOF_INT >= SIZEOF_SIZE_T
-    #define __Pyx_sst_abs(value) abs(value)
-#elif SIZEOF_LONG >= SIZEOF_SIZE_T
-    #define __Pyx_sst_abs(value) labs(value)
-#elif defined (_MSC_VER)
-    #define __Pyx_sst_abs(value) ((Py_ssize_t)_abs64(value))
-#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
-    #define __Pyx_sst_abs(value) llabs(value)
-#elif defined (__GNUC__)
-    #define __Pyx_sst_abs(value) __builtin_llabs(value)
-#else
-    #define __Pyx_sst_abs(value) ((value<0) ? -value : value)
-#endif
-static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject*);
-static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length);
-#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s))
-#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l)
-#define __Pyx_PyBytes_FromString        PyBytes_FromString
-#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize
-static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char*);
-#if PY_MAJOR_VERSION < 3
-    #define __Pyx_PyStr_FromString        __Pyx_PyBytes_FromString
-    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
-#else
-    #define __Pyx_PyStr_FromString        __Pyx_PyUnicode_FromString
-    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize
-#endif
-#define __Pyx_PyBytes_AsWritableString(s)     ((char*) PyBytes_AS_STRING(s))
-#define __Pyx_PyBytes_AsWritableSString(s)    ((signed char*) PyBytes_AS_STRING(s))
-#define __Pyx_PyBytes_AsWritableUString(s)    ((unsigned char*) PyBytes_AS_STRING(s))
-#define __Pyx_PyBytes_AsString(s)     ((const char*) PyBytes_AS_STRING(s))
-#define __Pyx_PyBytes_AsSString(s)    ((const signed char*) PyBytes_AS_STRING(s))
-#define __Pyx_PyBytes_AsUString(s)    ((const unsigned char*) PyBytes_AS_STRING(s))
-#define __Pyx_PyObject_AsWritableString(s)    ((char*) __Pyx_PyObject_AsString(s))
-#define __Pyx_PyObject_AsWritableSString(s)    ((signed char*) __Pyx_PyObject_AsString(s))
-#define __Pyx_PyObject_AsWritableUString(s)    ((unsigned char*) __Pyx_PyObject_AsString(s))
-#define __Pyx_PyObject_AsSString(s)    ((const signed char*) __Pyx_PyObject_AsString(s))
-#define __Pyx_PyObject_AsUString(s)    ((const unsigned char*) __Pyx_PyObject_AsString(s))
-#define __Pyx_PyObject_FromCString(s)  __Pyx_PyObject_FromString((const char*)s)
-#define __Pyx_PyBytes_FromCString(s)   __Pyx_PyBytes_FromString((const char*)s)
-#define __Pyx_PyByteArray_FromCString(s)   __Pyx_PyByteArray_FromString((const char*)s)
-#define __Pyx_PyStr_FromCString(s)     __Pyx_PyStr_FromString((const char*)s)
-#define __Pyx_PyUnicode_FromCString(s) __Pyx_PyUnicode_FromString((const char*)s)
-static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u) {
-    const Py_UNICODE *u_end = u;
-    while (*u_end++) ;
-    return (size_t)(u_end - u - 1);
-}
-#define __Pyx_PyUnicode_FromUnicode(u)       PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u))
-#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode
-#define __Pyx_PyUnicode_AsUnicode            PyUnicode_AsUnicode
-#define __Pyx_NewRef(obj) (Py_INCREF(obj), obj)
-#define __Pyx_Owned_Py_None(b) __Pyx_NewRef(Py_None)
-static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b);
-static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*);
-static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x);
-#define __Pyx_PySequence_Tuple(obj)\
-    (likely(PyTuple_CheckExact(obj)) ? __Pyx_NewRef(obj) : PySequence_Tuple(obj))
-static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*);
-static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t);
-#if CYTHON_ASSUME_SAFE_MACROS
-#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x))
-#else
-#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x)
-#endif
-#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x))
-#if PY_MAJOR_VERSION >= 3
-#define __Pyx_PyNumber_Int(x) (PyLong_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Long(x))
-#else
-#define __Pyx_PyNumber_Int(x) (PyInt_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Int(x))
-#endif
-#define __Pyx_PyNumber_Float(x) (PyFloat_CheckExact(x) ? __Pyx_NewRef(x) : PyNumber_Float(x))
-#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
-static int __Pyx_sys_getdefaultencoding_not_ascii;
-static int __Pyx_init_sys_getdefaultencoding_params(void) {
-    PyObject* sys;
-    PyObject* default_encoding = NULL;
-    PyObject* ascii_chars_u = NULL;
-    PyObject* ascii_chars_b = NULL;
-    const char* default_encoding_c;
-    sys = PyImport_ImportModule("sys");
-    if (!sys) goto bad;
-    default_encoding = PyObject_CallMethod(sys, (char*) "getdefaultencoding", NULL);
-    Py_DECREF(sys);
-    if (!default_encoding) goto bad;
-    default_encoding_c = PyBytes_AsString(default_encoding);
-    if (!default_encoding_c) goto bad;
-    if (strcmp(default_encoding_c, "ascii") == 0) {
-        __Pyx_sys_getdefaultencoding_not_ascii = 0;
-    } else {
-        char ascii_chars[128];
-        int c;
-        for (c = 0; c < 128; c++) {
-            ascii_chars[c] = c;
-        }
-        __Pyx_sys_getdefaultencoding_not_ascii = 1;
-        ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL);
-        if (!ascii_chars_u) goto bad;
-        ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL);
-        if (!ascii_chars_b || !PyBytes_Check(ascii_chars_b) || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) {
-            PyErr_Format(
-                PyExc_ValueError,
-                "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.",
-                default_encoding_c);
-            goto bad;
-        }
-        Py_DECREF(ascii_chars_u);
-        Py_DECREF(ascii_chars_b);
-    }
-    Py_DECREF(default_encoding);
-    return 0;
-bad:
-    Py_XDECREF(default_encoding);
-    Py_XDECREF(ascii_chars_u);
-    Py_XDECREF(ascii_chars_b);
-    return -1;
-}
-#endif
-#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3
-#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL)
-#else
-#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL)
-#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
-static char* __PYX_DEFAULT_STRING_ENCODING;
-static int __Pyx_init_sys_getdefaultencoding_params(void) {
-    PyObject* sys;
-    PyObject* default_encoding = NULL;
-    char* default_encoding_c;
-    sys = PyImport_ImportModule("sys");
-    if (!sys) goto bad;
-    default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
-    Py_DECREF(sys);
-    if (!default_encoding) goto bad;
-    default_encoding_c = PyBytes_AsString(default_encoding);
-    if (!default_encoding_c) goto bad;
-    __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c));
-    if (!__PYX_DEFAULT_STRING_ENCODING) goto bad;
-    strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c);
-    Py_DECREF(default_encoding);
-    return 0;
-bad:
-    Py_XDECREF(default_encoding);
-    return -1;
-}
-#endif
-#endif
-
-
-/* Test for GCC > 2.95 */
-#if defined(__GNUC__)     && (__GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95)))
-  #define likely(x)   __builtin_expect(!!(x), 1)
-  #define unlikely(x) __builtin_expect(!!(x), 0)
-#else /* !__GNUC__ or GCC < 2.95 */
-  #define likely(x)   (x)
-  #define unlikely(x) (x)
-#endif /* __GNUC__ */
-static CYTHON_INLINE void __Pyx_pretend_to_initialize(void* ptr) { (void)ptr; }
-
-static PyObject *__pyx_m = NULL;
-static PyObject *__pyx_d;
-static PyObject *__pyx_b;
-static PyObject *__pyx_cython_runtime = NULL;
-static PyObject *__pyx_empty_tuple;
-static PyObject *__pyx_empty_bytes;
-static PyObject *__pyx_empty_unicode;
-static int __pyx_lineno;
-static int __pyx_clineno = 0;
-static const char * __pyx_cfilenm= __FILE__;
-static const char *__pyx_filename;
-
-
-static const char *__pyx_f[] = {
-  "python/bfs/bfs_wrapper.pyx",
-};
-
-/*--- Type declarations ---*/
-struct __pyx_opt_args_11bfs_wrapper_bfs;
-
-/* "bfs_wrapper.pyx":152
- *         gdf_add_transpose(<gdf_graph*>graph)
- * 
- * cpdef bfs(G, start, directed=True):             # <<<<<<<<<<<<<<
- *     """
- *     Find the distances and predecessors for a breadth first traversal of a graph.
- */
-struct __pyx_opt_args_11bfs_wrapper_bfs {
-  int __pyx_n;
-  PyObject *directed;
-};
-
-/* --- Runtime support code (head) --- */
-/* Refnanny.proto */
-#ifndef CYTHON_REFNANNY
-  #define CYTHON_REFNANNY 0
-#endif
-#if CYTHON_REFNANNY
-  typedef struct {
-    void (*INCREF)(void*, PyObject*, int);
-    void (*DECREF)(void*, PyObject*, int);
-    void (*GOTREF)(void*, PyObject*, int);
-    void (*GIVEREF)(void*, PyObject*, int);
-    void* (*SetupContext)(const char*, int, const char*);
-    void (*FinishContext)(void**);
-  } __Pyx_RefNannyAPIStruct;
-  static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL;
-  static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname);
-  #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL;
-#ifdef WITH_THREAD
-  #define __Pyx_RefNannySetupContext(name, acquire_gil)\
-          if (acquire_gil) {\
-              PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();\
-              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\
-              PyGILState_Release(__pyx_gilstate_save);\
-          } else {\
-              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__);\
-          }
-#else
-  #define __Pyx_RefNannySetupContext(name, acquire_gil)\
-          __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__)
-#endif
-  #define __Pyx_RefNannyFinishContext()\
-          __Pyx_RefNanny->FinishContext(&__pyx_refnanny)
-  #define __Pyx_INCREF(r)  __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
-  #define __Pyx_DECREF(r)  __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
-  #define __Pyx_GOTREF(r)  __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
-  #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
-  #define __Pyx_XINCREF(r)  do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0)
-  #define __Pyx_XDECREF(r)  do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0)
-  #define __Pyx_XGOTREF(r)  do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0)
-  #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0)
-#else
-  #define __Pyx_RefNannyDeclarations
-  #define __Pyx_RefNannySetupContext(name, acquire_gil)
-  #define __Pyx_RefNannyFinishContext()
-  #define __Pyx_INCREF(r) Py_INCREF(r)
-  #define __Pyx_DECREF(r) Py_DECREF(r)
-  #define __Pyx_GOTREF(r)
-  #define __Pyx_GIVEREF(r)
-  #define __Pyx_XINCREF(r) Py_XINCREF(r)
-  #define __Pyx_XDECREF(r) Py_XDECREF(r)
-  #define __Pyx_XGOTREF(r)
-  #define __Pyx_XGIVEREF(r)
-#endif
-#define __Pyx_XDECREF_SET(r, v) do {\
-        PyObject *tmp = (PyObject *) r;\
-        r = v; __Pyx_XDECREF(tmp);\
-    } while (0)
-#define __Pyx_DECREF_SET(r, v) do {\
-        PyObject *tmp = (PyObject *) r;\
-        r = v; __Pyx_DECREF(tmp);\
-    } while (0)
-#define __Pyx_CLEAR(r)    do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0)
-#define __Pyx_XCLEAR(r)   do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0)
-
-/* PyObjectGetAttrStr.proto */
-#if CYTHON_USE_TYPE_SLOTS
-static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name);
-#else
-#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n)
-#endif
-
-/* GetBuiltinName.proto */
-static PyObject *__Pyx_GetBuiltinName(PyObject *name);
-
-/* GetModuleGlobalName.proto */
-static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name);
-
-/* PyCFunctionFastCall.proto */
-#if CYTHON_FAST_PYCCALL
-static CYTHON_INLINE PyObject *__Pyx_PyCFunction_FastCall(PyObject *func, PyObject **args, Py_ssize_t nargs);
-#else
-#define __Pyx_PyCFunction_FastCall(func, args, nargs)  (assert(0), NULL)
-#endif
-
-/* PyFunctionFastCall.proto */
-#if CYTHON_FAST_PYCALL
-#define __Pyx_PyFunction_FastCall(func, args, nargs)\
-    __Pyx_PyFunction_FastCallDict((func), (args), (nargs), NULL)
-#if 1 || PY_VERSION_HEX < 0x030600B1
-static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, int nargs, PyObject *kwargs);
-#else
-#define __Pyx_PyFunction_FastCallDict(func, args, nargs, kwargs) _PyFunction_FastCallDict(func, args, nargs, kwargs)
-#endif
-#endif
-
-/* PyObjectCall.proto */
-#if CYTHON_COMPILING_IN_CPYTHON
-static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw);
-#else
-#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw)
-#endif
-
-/* PyObjectCallMethO.proto */
-#if CYTHON_COMPILING_IN_CPYTHON
-static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg);
-#endif
-
-/* PyObjectCallOneArg.proto */
-static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg);
-
-/* PyObjectCallNoArg.proto */
-#if CYTHON_COMPILING_IN_CPYTHON
-static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func);
-#else
-#define __Pyx_PyObject_CallNoArg(func) __Pyx_PyObject_Call(func, __pyx_empty_tuple, NULL)
-#endif
-
-/* GetItemInt.proto */
-#define __Pyx_GetItemInt(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
-    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
-    __Pyx_GetItemInt_Fast(o, (Py_ssize_t)i, is_list, wraparound, boundscheck) :\
-    (is_list ? (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL) :\
-               __Pyx_GetItemInt_Generic(o, to_py_func(i))))
-#define __Pyx_GetItemInt_List(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
-    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
-    __Pyx_GetItemInt_List_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\
-    (PyErr_SetString(PyExc_IndexError, "list index out of range"), (PyObject*)NULL))
-static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i,
-                                                              int wraparound, int boundscheck);
-#define __Pyx_GetItemInt_Tuple(o, i, type, is_signed, to_py_func, is_list, wraparound, boundscheck)\
-    (__Pyx_fits_Py_ssize_t(i, type, is_signed) ?\
-    __Pyx_GetItemInt_Tuple_Fast(o, (Py_ssize_t)i, wraparound, boundscheck) :\
-    (PyErr_SetString(PyExc_IndexError, "tuple index out of range"), (PyObject*)NULL))
-static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i,
-                                                              int wraparound, int boundscheck);
-static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j);
-static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i,
-                                                     int is_list, int wraparound, int boundscheck);
-
-/* ObjectGetItem.proto */
-#if CYTHON_USE_TYPE_SLOTS
-static CYTHON_INLINE PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject* key);
-#else
-#define __Pyx_PyObject_GetItem(obj, key)  PyObject_GetItem(obj, key)
-#endif
-
-/* PyObjectSetAttrStr.proto */
-#if CYTHON_USE_TYPE_SLOTS
-#define __Pyx_PyObject_DelAttrStr(o,n) __Pyx_PyObject_SetAttrStr(o, n, NULL)
-static CYTHON_INLINE int __Pyx_PyObject_SetAttrStr(PyObject* obj, PyObject* attr_name, PyObject* value);
-#else
-#define __Pyx_PyObject_DelAttrStr(o,n)   PyObject_DelAttr(o,n)
-#define __Pyx_PyObject_SetAttrStr(o,n,v) PyObject_SetAttr(o,n,v)
-#endif
-
-/* RaiseArgTupleInvalid.proto */
-static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact,
-    Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found);
-
-/* RaiseDoubleKeywords.proto */
-static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name);
-
-/* ParseKeywords.proto */
-static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[],\
-    PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args,\
-    const char* function_name);
-
-/* Import.proto */
-static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level);
-
-/* ImportFrom.proto */
-static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name);
-
-/* FetchCommonType.proto */
-static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type);
-
-/* CythonFunction.proto */
-#define __Pyx_CyFunction_USED 1
-#define __Pyx_CYFUNCTION_STATICMETHOD  0x01
-#define __Pyx_CYFUNCTION_CLASSMETHOD   0x02
-#define __Pyx_CYFUNCTION_CCLASS        0x04
-#define __Pyx_CyFunction_GetClosure(f)\
-    (((__pyx_CyFunctionObject *) (f))->func_closure)
-#define __Pyx_CyFunction_GetClassObj(f)\
-    (((__pyx_CyFunctionObject *) (f))->func_classobj)
-#define __Pyx_CyFunction_Defaults(type, f)\
-    ((type *)(((__pyx_CyFunctionObject *) (f))->defaults))
-#define __Pyx_CyFunction_SetDefaultsGetter(f, g)\
-    ((__pyx_CyFunctionObject *) (f))->defaults_getter = (g)
-typedef struct {
-    PyCFunctionObject func;
-#if PY_VERSION_HEX < 0x030500A0
-    PyObject *func_weakreflist;
-#endif
-    PyObject *func_dict;
-    PyObject *func_name;
-    PyObject *func_qualname;
-    PyObject *func_doc;
-    PyObject *func_globals;
-    PyObject *func_code;
-    PyObject *func_closure;
-    PyObject *func_classobj;
-    void *defaults;
-    int defaults_pyobjects;
-    int flags;
-    PyObject *defaults_tuple;
-    PyObject *defaults_kwdict;
-    PyObject *(*defaults_getter)(PyObject *);
-    PyObject *func_annotations;
-} __pyx_CyFunctionObject;
-static PyTypeObject *__pyx_CyFunctionType = 0;
-#define __Pyx_CyFunction_NewEx(ml, flags, qualname, self, module, globals, code)\
-    __Pyx_CyFunction_New(__pyx_CyFunctionType, ml, flags, qualname, self, module, globals, code)
-static PyObject *__Pyx_CyFunction_New(PyTypeObject *, PyMethodDef *ml,
-                                      int flags, PyObject* qualname,
-                                      PyObject *self,
-                                      PyObject *module, PyObject *globals,
-                                      PyObject* code);
-static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *m,
-                                                         size_t size,
-                                                         int pyobjects);
-static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *m,
-                                                            PyObject *tuple);
-static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsKwDict(PyObject *m,
-                                                             PyObject *dict);
-static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *m,
-                                                              PyObject *dict);
-static int __pyx_CyFunction_init(void);
-
-/* SetNameInClass.proto */
-#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1
-#define __Pyx_SetNameInClass(ns, name, value)\
-    (likely(PyDict_CheckExact(ns)) ? _PyDict_SetItem_KnownHash(ns, name, value, ((PyASCIIObject *) name)->hash) : PyObject_SetItem(ns, name, value))
-#elif CYTHON_COMPILING_IN_CPYTHON
-#define __Pyx_SetNameInClass(ns, name, value)\
-    (likely(PyDict_CheckExact(ns)) ? PyDict_SetItem(ns, name, value) : PyObject_SetItem(ns, name, value))
-#else
-#define __Pyx_SetNameInClass(ns, name, value)  PyObject_SetItem(ns, name, value)
-#endif
-
-/* CalculateMetaclass.proto */
-static PyObject *__Pyx_CalculateMetaclass(PyTypeObject *metaclass, PyObject *bases);
-
-/* Py3ClassCreate.proto */
-static PyObject *__Pyx_Py3MetaclassPrepare(PyObject *metaclass, PyObject *bases, PyObject *name, PyObject *qualname,
-                                           PyObject *mkw, PyObject *modname, PyObject *doc);
-static PyObject *__Pyx_Py3ClassCreate(PyObject *metaclass, PyObject *name, PyObject *bases, PyObject *dict,
-                                      PyObject *mkw, int calculate_metaclass, int allow_py2_metaclass);
-
-/* PyThreadStateGet.proto */
-#if CYTHON_FAST_THREAD_STATE
-#define __Pyx_PyThreadState_declare  PyThreadState *__pyx_tstate;
-#define __Pyx_PyThreadState_assign  __pyx_tstate = __Pyx_PyThreadState_Current;
-#define __Pyx_PyErr_Occurred()  __pyx_tstate->curexc_type
-#else
-#define __Pyx_PyThreadState_declare
-#define __Pyx_PyThreadState_assign
-#define __Pyx_PyErr_Occurred()  PyErr_Occurred()
-#endif
-
-/* PyErrFetchRestore.proto */
-#if CYTHON_FAST_THREAD_STATE
-#define __Pyx_PyErr_Clear() __Pyx_ErrRestore(NULL, NULL, NULL)
-#define __Pyx_ErrRestoreWithState(type, value, tb)  __Pyx_ErrRestoreInState(PyThreadState_GET(), type, value, tb)
-#define __Pyx_ErrFetchWithState(type, value, tb)    __Pyx_ErrFetchInState(PyThreadState_GET(), type, value, tb)
-#define __Pyx_ErrRestore(type, value, tb)  __Pyx_ErrRestoreInState(__pyx_tstate, type, value, tb)
-#define __Pyx_ErrFetch(type, value, tb)    __Pyx_ErrFetchInState(__pyx_tstate, type, value, tb)
-static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb);
-static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb);
-#if CYTHON_COMPILING_IN_CPYTHON
-#define __Pyx_PyErr_SetNone(exc) (Py_INCREF(exc), __Pyx_ErrRestore((exc), NULL, NULL))
-#else
-#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
-#endif
-#else
-#define __Pyx_PyErr_Clear() PyErr_Clear()
-#define __Pyx_PyErr_SetNone(exc) PyErr_SetNone(exc)
-#define __Pyx_ErrRestoreWithState(type, value, tb)  PyErr_Restore(type, value, tb)
-#define __Pyx_ErrFetchWithState(type, value, tb)  PyErr_Fetch(type, value, tb)
-#define __Pyx_ErrRestoreInState(tstate, type, value, tb)  PyErr_Restore(type, value, tb)
-#define __Pyx_ErrFetchInState(tstate, type, value, tb)  PyErr_Fetch(type, value, tb)
-#define __Pyx_ErrRestore(type, value, tb)  PyErr_Restore(type, value, tb)
-#define __Pyx_ErrFetch(type, value, tb)  PyErr_Fetch(type, value, tb)
-#endif
-
-/* CLineInTraceback.proto */
-#ifdef CYTHON_CLINE_IN_TRACEBACK
-#define __Pyx_CLineForTraceback(tstate, c_line)  (((CYTHON_CLINE_IN_TRACEBACK)) ? c_line : 0)
-#else
-static int __Pyx_CLineForTraceback(PyThreadState *tstate, int c_line);
-#endif
-
-/* CodeObjectCache.proto */
-typedef struct {
-    PyCodeObject* code_object;
-    int code_line;
-} __Pyx_CodeObjectCacheEntry;
-struct __Pyx_CodeObjectCache {
-    int count;
-    int max_count;
-    __Pyx_CodeObjectCacheEntry* entries;
-};
-static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL};
-static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line);
-static PyCodeObject *__pyx_find_code_object(int code_line);
-static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object);
-
-/* AddTraceback.proto */
-static void __Pyx_AddTraceback(const char *funcname, int c_line,
-                               int py_line, const char *filename);
-
-/* CIntToPy.proto */
-static CYTHON_INLINE PyObject* __Pyx_PyInt_From_gdf_dtype(gdf_dtype value);
-
-/* Print.proto */
-static int __Pyx_Print(PyObject*, PyObject *, int);
-#if CYTHON_COMPILING_IN_PYPY || PY_MAJOR_VERSION >= 3
-static PyObject* __pyx_print = 0;
-static PyObject* __pyx_print_kwargs = 0;
-#endif
-
-/* CIntFromPy.proto */
-static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *);
-
-/* CIntFromPy.proto */
-static CYTHON_INLINE gdf_dtype __Pyx_PyInt_As_gdf_dtype(PyObject *);
-
-/* CIntFromPy.proto */
-static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *);
-
-/* PrintOne.proto */
-static int __Pyx_PrintOne(PyObject* stream, PyObject *o);
-
-/* CIntToPy.proto */
-static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value);
-
-/* CIntFromPy.proto */
-static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *);
-
-/* FastTypeChecks.proto */
-#if CYTHON_COMPILING_IN_CPYTHON
-#define __Pyx_TypeCheck(obj, type) __Pyx_IsSubtype(Py_TYPE(obj), (PyTypeObject *)type)
-static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b);
-static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject *type);
-static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *type1, PyObject *type2);
-#else
-#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type)
-#define __Pyx_PyErr_GivenExceptionMatches(err, type) PyErr_GivenExceptionMatches(err, type)
-#define __Pyx_PyErr_GivenExceptionMatches2(err, type1, type2) (PyErr_GivenExceptionMatches(err, type1) || PyErr_GivenExceptionMatches(err, type2))
-#endif
-#define __Pyx_PyException_Check(obj) __Pyx_TypeCheck(obj, PyExc_Exception)
-
-/* CheckBinaryVersion.proto */
-static int __Pyx_check_binary_version(void);
-
-/* InitStrings.proto */
-static int __Pyx_InitStrings(__Pyx_StringTabEntry *t);
-
-
-/* Module declarations from 'libcpp' */
-
-/* Module declarations from 'c_bfs' */
-
-/* Module declarations from 'libc.stdint' */
-
-/* Module declarations from 'libc.string' */
-
-/* Module declarations from 'libc.stdlib' */
-
-/* Module declarations from 'bfs_wrapper' */
-static PyObject *__pyx_f_11bfs_wrapper_create_column(PyObject *); /*proto*/
-static PyObject *__pyx_f_11bfs_wrapper_bfs(PyObject *, PyObject *, int __pyx_skip_dispatch, struct __pyx_opt_args_11bfs_wrapper_bfs *__pyx_optional_args); /*proto*/
-#define __Pyx_MODULE_NAME "bfs_wrapper"
-extern int __pyx_module_is_main_bfs_wrapper;
-int __pyx_module_is_main_bfs_wrapper = 0;
-
-/* Implementation of 'bfs_wrapper' */
-static const char __pyx_k_G[] = "G";
-static const char __pyx_k_g[] = "g";
-static const char __pyx_k_np[] = "np";
-static const char __pyx_k_doc[] = "__doc__";
-static const char __pyx_k_end[] = "end";
-static const char __pyx_k_gdf[] = "_gdf";
-static const char __pyx_k_obj[] = "obj";
-static const char __pyx_k_rmm[] = "rmm";
-static const char __pyx_k_cudf[] = "cudf";
-static const char __pyx_k_data[] = "_data";
-static const char __pyx_k_dest[] = "dest";
-static const char __pyx_k_file[] = "file";
-static const char __pyx_k_init[] = "__init__";
-static const char __pyx_k_main[] = "__main__";
-static const char __pyx_k_mask[] = "_mask";
-static const char __pyx_k_self[] = "self";
-static const char __pyx_k_size[] = "size";
-static const char __pyx_k_test[] = "__test__";
-static const char __pyx_k_type[] = "type";
-static const char __pyx_k_Graph[] = "Graph";
-static const char __pyx_k_dtype[] = "dtype";
-static const char __pyx_k_graph[] = "graph";
-static const char __pyx_k_int32[] = "int32";
-static const char __pyx_k_int64[] = "int64";
-static const char __pyx_k_numpy[] = "numpy";
-static const char __pyx_k_print[] = "print";
-static const char __pyx_k_start[] = "start";
-static const char __pyx_k_value[] = "value";
-static const char __pyx_k_zeros[] = "zeros";
-static const char __pyx_k_Series[] = "Series";
-static const char __pyx_k_column[] = "_column";
-static const char __pyx_k_data_2[] = "data";
-static const char __pyx_k_dtypes[] = "dtypes";
-static const char __pyx_k_import[] = "__import__";
-static const char __pyx_k_librmm[] = "librmm";
-static const char __pyx_k_module[] = "__module__";
-static const char __pyx_k_source[] = "source";
-static const char __pyx_k_float32[] = "float32";
-static const char __pyx_k_float64[] = "float64";
-static const char __pyx_k_indices[] = "indices";
-static const char __pyx_k_offsets[] = "offsets";
-static const char __pyx_k_prepare[] = "__prepare__";
-static const char __pyx_k_dest_col[] = "dest_col";
-static const char __pyx_k_directed[] = "directed";
-static const char __pyx_k_qualname[] = "__qualname__";
-static const char __pyx_k_cffi_view[] = "cffi_view";
-static const char __pyx_k_graph_ptr[] = "graph_ptr";
-static const char __pyx_k_metaclass[] = "__metaclass__";
-static const char __pyx_k_value_col[] = "value_col";
-static const char __pyx_k_null_count[] = "null_count";
-static const char __pyx_k_source_col[] = "source_col";
-static const char __pyx_k_bfs_wrapper[] = "bfs_wrapper";
-static const char __pyx_k_indices_col[] = "indices_col";
-static const char __pyx_k_librmm_cffi[] = "librmm_cffi";
-static const char __pyx_k_offsets_col[] = "offsets_col";
-static const char __pyx_k_Graph___init[] = "Graph.__init__";
-static const char __pyx_k_add_adj_list[] = "add_adj_list";
-static const char __pyx_k_bfs_line_152[] = "bfs (line 152)";
-static const char __pyx_k_to_gpu_array[] = "to_gpu_array";
-static const char __pyx_k_add_edge_list[] = "add_edge_list";
-static const char __pyx_k_add_transpose[] = "add_transpose";
-static const char __pyx_k_get_ctype_ptr[] = "_get_ctype_ptr";
-static const char __pyx_k_view_edge_list[] = "view_edge_list";
-static const char __pyx_k_Graph_add_adj_list[] = "Graph.add_adj_list";
-static const char __pyx_k_cline_in_traceback[] = "cline_in_traceback";
-static const char __pyx_k_Graph_add_edge_list[] = "Graph.add_edge_list";
-static const char __pyx_k_Graph_add_transpose[] = "Graph.add_transpose";
-static const char __pyx_k_get_column_data_ptr[] = "_get_column_data_ptr";
-static const char __pyx_k_Graph_view_edge_list[] = "Graph.view_edge_list";
-static const char __pyx_k_get_column_valid_ptr[] = "_get_column_valid_ptr";
-static const char __pyx_k_device_ctypes_pointer[] = "device_ctypes_pointer";
-static const char __pyx_k_Graph___init___line_48[] = "Graph.__init__ (line 48)";
-static const char __pyx_k_cffi_view_to_column_mem[] = "cffi_view_to_column_mem";
-static const char __pyx_k_python_bfs_bfs_wrapper_pyx[] = "python/bfs/bfs_wrapper.pyx";
-static const char __pyx_k_Graph_add_edge_list_line_66[] = "Graph.add_edge_list (line 66)";
-static const char __pyx_k_cuGraph_graph_class_containing[] = "\n        cuGraph graph class containing basic graph creation and transformation operations.\n    ";
-static const char __pyx_k_Find_the_distances_and_predeces[] = "\n    Find the distances and predecessors for a breadth first traversal of a graph.\n    \n    Parameters\n    ----------\n    G : cugraph.graph\n        cuGraph graph descriptor, should contain the connectivity information as an\n        adjacency list.\n    start : Integer\n        The index of the graph vertex from which the traversal begins\n    directed : bool\n        Indicates whether the graph in question is a directed graph, or whether\n        each edge has a corresponding reverse edge. (Allows optimizations if the\n        graph is undirected)\n    \n    Returns\n    -------\n    distances, predecessors : cudf.Series\n        distances gives the path distance for each vertex from the starting vertex\n        predecessors gives for each vertex the vertex it was reached from in the traversal\n        \n    Examples\n    --------\n    >>> M = ReadMtxFile(graph_file)\n    >>> sources = cudf.Series(M.row)\n    >>> destinations = cudf.Series(M.col)\n    >>> G = cuGraph.Graph()\n    >>> G.add_edge_list(sources,destinations,none)\n    >>> dist, pred = cuGraph.bfs(G, 0, false)\n    ";
-static const char __pyx_k_Returns_Graph_cuGraph_Graph_Exa[] = "\n        Returns\n        -------\n        Graph : cuGraph.Graph.\n\n        Examples\n        --------\n        >>> import cuGraph\n        >>> G = cuGraph.Graph()\n        ";
-static const char __pyx_k_Warp_existing_gdf_columns_repre[] = "\n        Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory. \n        The cuGraph graph should not already contain the connectivity information as an edge list.\n        If successful, the cuGraph graph descriptor contains the newly added edge list (edge_data is optional).\n\n        Parameters\n        ----------\n        source_indices : gdf_column       \n            This gdf_column of size E (number of edges) contains the index of the source for each edge.\n            Indices must be in the range [0, V-1]. \n        destination_indices   : gdf_column\n            This gdf_column of size E (number of edges) contains the index of the destination for each edge. \n            Indices must be in the range [0, V-1].\n        edge_data (optional)  : gdf_column\n            This pointer can be ``none``. If not, this gdf_column of size E (number of edges) contains the weiht for each edge. \n            The type expected to be floating point.\n\n        Examples\n        --------\n        >>> import cuGraph\n        >>> import cudf\n        >>> from scipy.io import mmread\n        >>> M = ReadMtxFile(graph_file)\n        >>> sources = cudf.Series(M.row)\n        >>> destinations = cudf.Series(M.col)\n        >>> G = cuGraph.Graph()\n        >>> G.add_edge_list(sources,destinations,none)\n        \n        ";
-static PyObject *__pyx_kp_u_Find_the_distances_and_predeces;
-static PyObject *__pyx_n_s_G;
-static PyObject *__pyx_n_s_Graph;
-static PyObject *__pyx_n_s_Graph___init;
-static PyObject *__pyx_kp_u_Graph___init___line_48;
-static PyObject *__pyx_n_s_Graph_add_adj_list;
-static PyObject *__pyx_n_s_Graph_add_edge_list;
-static PyObject *__pyx_kp_u_Graph_add_edge_list_line_66;
-static PyObject *__pyx_n_s_Graph_add_transpose;
-static PyObject *__pyx_n_s_Graph_view_edge_list;
-static PyObject *__pyx_kp_u_Returns_Graph_cuGraph_Graph_Exa;
-static PyObject *__pyx_n_s_Series;
-static PyObject *__pyx_kp_u_Warp_existing_gdf_columns_repre;
-static PyObject *__pyx_n_s_add_adj_list;
-static PyObject *__pyx_n_s_add_edge_list;
-static PyObject *__pyx_n_s_add_transpose;
-static PyObject *__pyx_kp_u_bfs_line_152;
-static PyObject *__pyx_n_s_bfs_wrapper;
-static PyObject *__pyx_n_s_cffi_view;
-static PyObject *__pyx_n_s_cffi_view_to_column_mem;
-static PyObject *__pyx_n_s_cline_in_traceback;
-static PyObject *__pyx_n_s_column;
-static PyObject *__pyx_kp_s_cuGraph_graph_class_containing;
-static PyObject *__pyx_n_s_cudf;
-static PyObject *__pyx_n_s_data;
-static PyObject *__pyx_n_s_data_2;
-static PyObject *__pyx_n_s_dest;
-static PyObject *__pyx_n_s_dest_col;
-static PyObject *__pyx_n_s_device_ctypes_pointer;
-static PyObject *__pyx_n_s_directed;
-static PyObject *__pyx_n_s_doc;
-static PyObject *__pyx_n_s_dtype;
-static PyObject *__pyx_n_s_dtypes;
-static PyObject *__pyx_n_s_end;
-static PyObject *__pyx_n_s_file;
-static PyObject *__pyx_n_s_float32;
-static PyObject *__pyx_n_s_float64;
-static PyObject *__pyx_n_s_g;
-static PyObject *__pyx_n_s_gdf;
-static PyObject *__pyx_n_s_get_column_data_ptr;
-static PyObject *__pyx_n_s_get_column_valid_ptr;
-static PyObject *__pyx_n_s_get_ctype_ptr;
-static PyObject *__pyx_n_s_graph;
-static PyObject *__pyx_n_s_graph_ptr;
-static PyObject *__pyx_n_s_import;
-static PyObject *__pyx_n_s_indices;
-static PyObject *__pyx_n_s_indices_col;
-static PyObject *__pyx_n_s_init;
-static PyObject *__pyx_n_s_int32;
-static PyObject *__pyx_n_s_int64;
-static PyObject *__pyx_n_s_librmm;
-static PyObject *__pyx_n_s_librmm_cffi;
-static PyObject *__pyx_n_s_main;
-static PyObject *__pyx_n_s_mask;
-static PyObject *__pyx_n_s_metaclass;
-static PyObject *__pyx_n_s_module;
-static PyObject *__pyx_n_s_np;
-static PyObject *__pyx_n_s_null_count;
-static PyObject *__pyx_n_s_numpy;
-static PyObject *__pyx_n_s_obj;
-static PyObject *__pyx_n_s_offsets;
-static PyObject *__pyx_n_s_offsets_col;
-static PyObject *__pyx_n_s_prepare;
-static PyObject *__pyx_n_s_print;
-static PyObject *__pyx_kp_s_python_bfs_bfs_wrapper_pyx;
-static PyObject *__pyx_n_s_qualname;
-static PyObject *__pyx_n_s_rmm;
-static PyObject *__pyx_n_s_self;
-static PyObject *__pyx_n_s_size;
-static PyObject *__pyx_n_s_source;
-static PyObject *__pyx_n_s_source_col;
-static PyObject *__pyx_n_s_start;
-static PyObject *__pyx_n_s_test;
-static PyObject *__pyx_n_s_to_gpu_array;
-static PyObject *__pyx_n_s_type;
-static PyObject *__pyx_n_s_value;
-static PyObject *__pyx_n_s_value_col;
-static PyObject *__pyx_n_s_view_edge_list;
-static PyObject *__pyx_n_s_zeros;
-static PyObject *__pyx_pf_11bfs_wrapper__get_ctype_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj); /* proto */
-static PyObject *__pyx_pf_11bfs_wrapper_2_get_column_data_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj); /* proto */
-static PyObject *__pyx_pf_11bfs_wrapper_4_get_column_valid_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj); /* proto */
-static PyObject *__pyx_pf_11bfs_wrapper_5Graph___init__(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self); /* proto */
-static PyObject *__pyx_pf_11bfs_wrapper_5Graph_2add_edge_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self, PyObject *__pyx_v_source_col, PyObject *__pyx_v_dest_col, PyObject *__pyx_v_value_col); /* proto */
-static PyObject *__pyx_pf_11bfs_wrapper_5Graph_4view_edge_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self); /* proto */
-static PyObject *__pyx_pf_11bfs_wrapper_5Graph_6add_adj_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self, PyObject *__pyx_v_offsets_col, PyObject *__pyx_v_indices_col, PyObject *__pyx_v_value_col); /* proto */
-static PyObject *__pyx_pf_11bfs_wrapper_5Graph_8add_transpose(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self); /* proto */
-static PyObject *__pyx_pf_11bfs_wrapper_6bfs(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_G, PyObject *__pyx_v_start, PyObject *__pyx_v_directed); /* proto */
-static PyObject *__pyx_int_0;
-static PyObject *__pyx_tuple_;
-static PyObject *__pyx_tuple__3;
-static PyObject *__pyx_tuple__5;
-static PyObject *__pyx_tuple__7;
-static PyObject *__pyx_tuple__9;
-static PyObject *__pyx_tuple__11;
-static PyObject *__pyx_tuple__12;
-static PyObject *__pyx_tuple__14;
-static PyObject *__pyx_tuple__16;
-static PyObject *__pyx_codeobj__2;
-static PyObject *__pyx_codeobj__4;
-static PyObject *__pyx_codeobj__6;
-static PyObject *__pyx_codeobj__8;
-static PyObject *__pyx_codeobj__10;
-static PyObject *__pyx_codeobj__13;
-static PyObject *__pyx_codeobj__15;
-static PyObject *__pyx_codeobj__17;
-/* Late includes */
-
-/* "bfs_wrapper.pyx":12
- * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}
- * 
- * def _get_ctype_ptr(obj):             # <<<<<<<<<<<<<<
- *     # The manner to access the pointers in the gdf's might change, so
- *     # encapsulating access in the following 3 methods. They might also be
- */
-
-/* Python wrapper */
-static PyObject *__pyx_pw_11bfs_wrapper_1_get_ctype_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj); /*proto*/
-static PyMethodDef __pyx_mdef_11bfs_wrapper_1_get_ctype_ptr = {"_get_ctype_ptr", (PyCFunction)__pyx_pw_11bfs_wrapper_1_get_ctype_ptr, METH_O, 0};
-static PyObject *__pyx_pw_11bfs_wrapper_1_get_ctype_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj) {
-  PyObject *__pyx_r = 0;
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("_get_ctype_ptr (wrapper)", 0);
-  __pyx_r = __pyx_pf_11bfs_wrapper__get_ctype_ptr(__pyx_self, ((PyObject *)__pyx_v_obj));
-
-  /* function exit code */
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-static PyObject *__pyx_pf_11bfs_wrapper__get_ctype_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj) {
-  PyObject *__pyx_r = NULL;
-  __Pyx_RefNannyDeclarations
-  PyObject *__pyx_t_1 = NULL;
-  PyObject *__pyx_t_2 = NULL;
-  __Pyx_RefNannySetupContext("_get_ctype_ptr", 0);
-
-  /* "bfs_wrapper.pyx":16
- *     # encapsulating access in the following 3 methods. They might also be
- *     # part of future gdf versions.
- *     return obj.device_ctypes_pointer.value             # <<<<<<<<<<<<<<
- * 
- * def _get_column_data_ptr(obj):
- */
-  __Pyx_XDECREF(__pyx_r);
-  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_obj, __pyx_n_s_device_ctypes_pointer); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_value); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 16, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_r = __pyx_t_2;
-  __pyx_t_2 = 0;
-  goto __pyx_L0;
-
-  /* "bfs_wrapper.pyx":12
- * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}
- * 
- * def _get_ctype_ptr(obj):             # <<<<<<<<<<<<<<
- *     # The manner to access the pointers in the gdf's might change, so
- *     # encapsulating access in the following 3 methods. They might also be
- */
-
-  /* function exit code */
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_XDECREF(__pyx_t_2);
-  __Pyx_AddTraceback("bfs_wrapper._get_ctype_ptr", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __pyx_r = NULL;
-  __pyx_L0:;
-  __Pyx_XGIVEREF(__pyx_r);
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-/* "bfs_wrapper.pyx":18
- *     return obj.device_ctypes_pointer.value
- * 
- * def _get_column_data_ptr(obj):             # <<<<<<<<<<<<<<
- *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
- * 
- */
-
-/* Python wrapper */
-static PyObject *__pyx_pw_11bfs_wrapper_3_get_column_data_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj); /*proto*/
-static PyMethodDef __pyx_mdef_11bfs_wrapper_3_get_column_data_ptr = {"_get_column_data_ptr", (PyCFunction)__pyx_pw_11bfs_wrapper_3_get_column_data_ptr, METH_O, 0};
-static PyObject *__pyx_pw_11bfs_wrapper_3_get_column_data_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj) {
-  PyObject *__pyx_r = 0;
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("_get_column_data_ptr (wrapper)", 0);
-  __pyx_r = __pyx_pf_11bfs_wrapper_2_get_column_data_ptr(__pyx_self, ((PyObject *)__pyx_v_obj));
-
-  /* function exit code */
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-static PyObject *__pyx_pf_11bfs_wrapper_2_get_column_data_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj) {
-  PyObject *__pyx_r = NULL;
-  __Pyx_RefNannyDeclarations
-  PyObject *__pyx_t_1 = NULL;
-  PyObject *__pyx_t_2 = NULL;
-  PyObject *__pyx_t_3 = NULL;
-  PyObject *__pyx_t_4 = NULL;
-  PyObject *__pyx_t_5 = NULL;
-  __Pyx_RefNannySetupContext("_get_column_data_ptr", 0);
-
-  /* "bfs_wrapper.pyx":19
- * 
- * def _get_column_data_ptr(obj):
- *     return _get_ctype_ptr(obj._column._data.to_gpu_array())             # <<<<<<<<<<<<<<
- * 
- * def _get_column_valid_ptr(obj):
- */
-  __Pyx_XDECREF(__pyx_r);
-  __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_get_ctype_ptr); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 19, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_obj, __pyx_n_s_column); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 19, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_data); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 19, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_5);
-  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_to_gpu_array); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 19, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
-  __pyx_t_5 = NULL;
-  if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) {
-    __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_4);
-    if (likely(__pyx_t_5)) {
-      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4);
-      __Pyx_INCREF(__pyx_t_5);
-      __Pyx_INCREF(function);
-      __Pyx_DECREF_SET(__pyx_t_4, function);
-    }
-  }
-  if (__pyx_t_5) {
-    __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 19, __pyx_L1_error)
-    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
-  } else {
-    __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 19, __pyx_L1_error)
-  }
-  __Pyx_GOTREF(__pyx_t_3);
-  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-  __pyx_t_4 = NULL;
-  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) {
-    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_2);
-    if (likely(__pyx_t_4)) {
-      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2);
-      __Pyx_INCREF(__pyx_t_4);
-      __Pyx_INCREF(function);
-      __Pyx_DECREF_SET(__pyx_t_2, function);
-    }
-  }
-  if (!__pyx_t_4) {
-    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
-    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-    __Pyx_GOTREF(__pyx_t_1);
-  } else {
-    #if CYTHON_FAST_PYCALL
-    if (PyFunction_Check(__pyx_t_2)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_3};
-      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-    } else
-    #endif
-    #if CYTHON_FAST_PYCCALL
-    if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_3};
-      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-    } else
-    #endif
-    {
-      __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 19, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_5);
-      __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_4); __pyx_t_4 = NULL;
-      __Pyx_GIVEREF(__pyx_t_3);
-      PyTuple_SET_ITEM(__pyx_t_5, 0+1, __pyx_t_3);
-      __pyx_t_3 = 0;
-      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
-    }
-  }
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-  __pyx_r = __pyx_t_1;
-  __pyx_t_1 = 0;
-  goto __pyx_L0;
-
-  /* "bfs_wrapper.pyx":18
- *     return obj.device_ctypes_pointer.value
- * 
- * def _get_column_data_ptr(obj):             # <<<<<<<<<<<<<<
- *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
- * 
- */
-
-  /* function exit code */
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_XDECREF(__pyx_t_2);
-  __Pyx_XDECREF(__pyx_t_3);
-  __Pyx_XDECREF(__pyx_t_4);
-  __Pyx_XDECREF(__pyx_t_5);
-  __Pyx_AddTraceback("bfs_wrapper._get_column_data_ptr", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __pyx_r = NULL;
-  __pyx_L0:;
-  __Pyx_XGIVEREF(__pyx_r);
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-/* "bfs_wrapper.pyx":21
- *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
- * 
- * def _get_column_valid_ptr(obj):             # <<<<<<<<<<<<<<
- *     return _get_ctype_ptr(obj._column._mask.to_gpu_array())
- * 
- */
-
-/* Python wrapper */
-static PyObject *__pyx_pw_11bfs_wrapper_5_get_column_valid_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj); /*proto*/
-static PyMethodDef __pyx_mdef_11bfs_wrapper_5_get_column_valid_ptr = {"_get_column_valid_ptr", (PyCFunction)__pyx_pw_11bfs_wrapper_5_get_column_valid_ptr, METH_O, 0};
-static PyObject *__pyx_pw_11bfs_wrapper_5_get_column_valid_ptr(PyObject *__pyx_self, PyObject *__pyx_v_obj) {
-  PyObject *__pyx_r = 0;
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("_get_column_valid_ptr (wrapper)", 0);
-  __pyx_r = __pyx_pf_11bfs_wrapper_4_get_column_valid_ptr(__pyx_self, ((PyObject *)__pyx_v_obj));
-
-  /* function exit code */
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-static PyObject *__pyx_pf_11bfs_wrapper_4_get_column_valid_ptr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_obj) {
-  PyObject *__pyx_r = NULL;
-  __Pyx_RefNannyDeclarations
-  PyObject *__pyx_t_1 = NULL;
-  PyObject *__pyx_t_2 = NULL;
-  PyObject *__pyx_t_3 = NULL;
-  PyObject *__pyx_t_4 = NULL;
-  PyObject *__pyx_t_5 = NULL;
-  __Pyx_RefNannySetupContext("_get_column_valid_ptr", 0);
-
-  /* "bfs_wrapper.pyx":22
- * 
- * def _get_column_valid_ptr(obj):
- *     return _get_ctype_ptr(obj._column._mask.to_gpu_array())             # <<<<<<<<<<<<<<
- * 
- * #def _get_gdf_as_matrix_ptr(gdf):
- */
-  __Pyx_XDECREF(__pyx_r);
-  __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_get_ctype_ptr); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 22, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_v_obj, __pyx_n_s_column); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 22, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_mask); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 22, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_5);
-  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_to_gpu_array); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 22, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
-  __pyx_t_5 = NULL;
-  if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) {
-    __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_4);
-    if (likely(__pyx_t_5)) {
-      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4);
-      __Pyx_INCREF(__pyx_t_5);
-      __Pyx_INCREF(function);
-      __Pyx_DECREF_SET(__pyx_t_4, function);
-    }
-  }
-  if (__pyx_t_5) {
-    __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_5); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 22, __pyx_L1_error)
-    __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
-  } else {
-    __pyx_t_3 = __Pyx_PyObject_CallNoArg(__pyx_t_4); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 22, __pyx_L1_error)
-  }
-  __Pyx_GOTREF(__pyx_t_3);
-  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-  __pyx_t_4 = NULL;
-  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) {
-    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_2);
-    if (likely(__pyx_t_4)) {
-      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2);
-      __Pyx_INCREF(__pyx_t_4);
-      __Pyx_INCREF(function);
-      __Pyx_DECREF_SET(__pyx_t_2, function);
-    }
-  }
-  if (!__pyx_t_4) {
-    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 22, __pyx_L1_error)
-    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-    __Pyx_GOTREF(__pyx_t_1);
-  } else {
-    #if CYTHON_FAST_PYCALL
-    if (PyFunction_Check(__pyx_t_2)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_3};
-      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 22, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-    } else
-    #endif
-    #if CYTHON_FAST_PYCCALL
-    if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_3};
-      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 22, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-    } else
-    #endif
-    {
-      __pyx_t_5 = PyTuple_New(1+1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 22, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_5);
-      __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_4); __pyx_t_4 = NULL;
-      __Pyx_GIVEREF(__pyx_t_3);
-      PyTuple_SET_ITEM(__pyx_t_5, 0+1, __pyx_t_3);
-      __pyx_t_3 = 0;
-      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 22, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
-    }
-  }
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-  __pyx_r = __pyx_t_1;
-  __pyx_t_1 = 0;
-  goto __pyx_L0;
-
-  /* "bfs_wrapper.pyx":21
- *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
- * 
- * def _get_column_valid_ptr(obj):             # <<<<<<<<<<<<<<
- *     return _get_ctype_ptr(obj._column._mask.to_gpu_array())
- * 
- */
-
-  /* function exit code */
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_XDECREF(__pyx_t_2);
-  __Pyx_XDECREF(__pyx_t_3);
-  __Pyx_XDECREF(__pyx_t_4);
-  __Pyx_XDECREF(__pyx_t_5);
-  __Pyx_AddTraceback("bfs_wrapper._get_column_valid_ptr", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __pyx_r = NULL;
-  __pyx_L0:;
-  __Pyx_XGIVEREF(__pyx_r);
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-/* "bfs_wrapper.pyx":27
- * #    return self._get_ctype_ptr(gdf.as_gpu_matrix())
- * 
- * cdef create_column(col):             # <<<<<<<<<<<<<<
- * 
- *     x= <gdf_column*>malloc(sizeof(gdf_column))
- */
-
-static PyObject *__pyx_f_11bfs_wrapper_create_column(PyObject *__pyx_v_col) {
-  CYTHON_UNUSED gdf_column *__pyx_v_x;
-  gdf_column *__pyx_v_c_col;
-  uintptr_t __pyx_v_data_ptr;
-  uintptr_t __pyx_v_col_ptr;
-  PyObject *__pyx_r = NULL;
-  __Pyx_RefNannyDeclarations
-  PyObject *__pyx_t_1 = NULL;
-  PyObject *__pyx_t_2 = NULL;
-  PyObject *__pyx_t_3 = NULL;
-  PyObject *__pyx_t_4 = NULL;
-  uintptr_t __pyx_t_5;
-  Py_ssize_t __pyx_t_6;
-  gdf_dtype __pyx_t_7;
-  gdf_size_type __pyx_t_8;
-  __Pyx_RefNannySetupContext("create_column", 0);
-
-  /* "bfs_wrapper.pyx":29
- * cdef create_column(col):
- * 
- *     x= <gdf_column*>malloc(sizeof(gdf_column))             # <<<<<<<<<<<<<<
- *     cdef gdf_column* c_col = <gdf_column*>malloc(sizeof(gdf_column))
- *     cdef uintptr_t data_ptr = _get_column_data_ptr(col)
- */
-  __pyx_v_x = ((gdf_column *)malloc((sizeof(gdf_column))));
-
-  /* "bfs_wrapper.pyx":30
- * 
- *     x= <gdf_column*>malloc(sizeof(gdf_column))
- *     cdef gdf_column* c_col = <gdf_column*>malloc(sizeof(gdf_column))             # <<<<<<<<<<<<<<
- *     cdef uintptr_t data_ptr = _get_column_data_ptr(col)
- *     #cdef uintptr_t valid_ptr = _get_column_valid_ptr(col)
- */
-  __pyx_v_c_col = ((gdf_column *)malloc((sizeof(gdf_column))));
-
-  /* "bfs_wrapper.pyx":31
- *     x= <gdf_column*>malloc(sizeof(gdf_column))
- *     cdef gdf_column* c_col = <gdf_column*>malloc(sizeof(gdf_column))
- *     cdef uintptr_t data_ptr = _get_column_data_ptr(col)             # <<<<<<<<<<<<<<
- *     #cdef uintptr_t valid_ptr = _get_column_valid_ptr(col)
- * 
- */
-  __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_get_column_data_ptr); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 31, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  __pyx_t_3 = NULL;
-  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_2))) {
-    __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_2);
-    if (likely(__pyx_t_3)) {
-      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_2);
-      __Pyx_INCREF(__pyx_t_3);
-      __Pyx_INCREF(function);
-      __Pyx_DECREF_SET(__pyx_t_2, function);
-    }
-  }
-  if (!__pyx_t_3) {
-    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_2, __pyx_v_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 31, __pyx_L1_error)
-    __Pyx_GOTREF(__pyx_t_1);
-  } else {
-    #if CYTHON_FAST_PYCALL
-    if (PyFunction_Check(__pyx_t_2)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_col};
-      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 31, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-    } else
-    #endif
-    #if CYTHON_FAST_PYCCALL
-    if (__Pyx_PyFastCFunction_Check(__pyx_t_2)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_v_col};
-      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_2, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 31, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-    } else
-    #endif
-    {
-      __pyx_t_4 = PyTuple_New(1+1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 31, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_4);
-      __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3); __pyx_t_3 = NULL;
-      __Pyx_INCREF(__pyx_v_col);
-      __Pyx_GIVEREF(__pyx_v_col);
-      PyTuple_SET_ITEM(__pyx_t_4, 0+1, __pyx_v_col);
-      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 31, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-    }
-  }
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-  __pyx_t_5 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_5 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 31, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_data_ptr = __pyx_t_5;
-
-  /* "bfs_wrapper.pyx":37
- *                               <void*> data_ptr,
- *                               <gdf_valid_type*> 0,
- *                               <gdf_size_type>len(col),             # <<<<<<<<<<<<<<
- *                               dtypes[col.dtype.type],
- *                               <gdf_size_type>col.null_count)
- */
-  __pyx_t_6 = PyObject_Length(__pyx_v_col); if (unlikely(__pyx_t_6 == ((Py_ssize_t)-1))) __PYX_ERR(0, 37, __pyx_L1_error)
-
-  /* "bfs_wrapper.pyx":38
- *                               <gdf_valid_type*> 0,
- *                               <gdf_size_type>len(col),
- *                               dtypes[col.dtype.type],             # <<<<<<<<<<<<<<
- *                               <gdf_size_type>col.null_count)
- * 
- */
-  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_dtypes); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 38, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_col, __pyx_n_s_dtype); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 38, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_2, __pyx_n_s_type); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 38, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-  __pyx_t_2 = __Pyx_PyObject_GetItem(__pyx_t_1, __pyx_t_4); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 38, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-  __pyx_t_7 = ((gdf_dtype)__Pyx_PyInt_As_gdf_dtype(__pyx_t_2)); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 38, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-
-  /* "bfs_wrapper.pyx":39
- *                               <gdf_size_type>len(col),
- *                               dtypes[col.dtype.type],
- *                               <gdf_size_type>col.null_count)             # <<<<<<<<<<<<<<
- * 
- *     cdef uintptr_t col_ptr = <uintptr_t>c_col
- */
-  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_col, __pyx_n_s_null_count); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 39, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  __pyx_t_8 = __Pyx_PyInt_As_size_t(__pyx_t_2); if (unlikely((__pyx_t_8 == ((gdf_size_type)-1)) && PyErr_Occurred())) __PYX_ERR(0, 39, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-
-  /* "bfs_wrapper.pyx":34
- *     #cdef uintptr_t valid_ptr = _get_column_valid_ptr(col)
- * 
- *     gdf_column_view_augmented(<gdf_column*>c_col,             # <<<<<<<<<<<<<<
- *                               <void*> data_ptr,
- *                               <gdf_valid_type*> 0,
- */
-  (void)(gdf_column_view_augmented(((gdf_column *)__pyx_v_c_col), ((void *)__pyx_v_data_ptr), ((gdf_valid_type *)0), ((gdf_size_type)__pyx_t_6), __pyx_t_7, ((gdf_size_type)__pyx_t_8)));
-
-  /* "bfs_wrapper.pyx":41
- *                               <gdf_size_type>col.null_count)
- * 
- *     cdef uintptr_t col_ptr = <uintptr_t>c_col             # <<<<<<<<<<<<<<
- *     return col_ptr
- * 
- */
-  __pyx_v_col_ptr = ((uintptr_t)__pyx_v_c_col);
-
-  /* "bfs_wrapper.pyx":42
- * 
- *     cdef uintptr_t col_ptr = <uintptr_t>c_col
- *     return col_ptr             # <<<<<<<<<<<<<<
- * 
- * class Graph:
- */
-  __Pyx_XDECREF(__pyx_r);
-  __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_v_col_ptr); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 42, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  __pyx_r = __pyx_t_2;
-  __pyx_t_2 = 0;
-  goto __pyx_L0;
-
-  /* "bfs_wrapper.pyx":27
- * #    return self._get_ctype_ptr(gdf.as_gpu_matrix())
- * 
- * cdef create_column(col):             # <<<<<<<<<<<<<<
- * 
- *     x= <gdf_column*>malloc(sizeof(gdf_column))
- */
-
-  /* function exit code */
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_XDECREF(__pyx_t_2);
-  __Pyx_XDECREF(__pyx_t_3);
-  __Pyx_XDECREF(__pyx_t_4);
-  __Pyx_AddTraceback("bfs_wrapper.create_column", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __pyx_r = 0;
-  __pyx_L0:;
-  __Pyx_XGIVEREF(__pyx_r);
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-/* "bfs_wrapper.pyx":48
- *         cuGraph graph class containing basic graph creation and transformation operations.
- *     """
- *     def __init__(self):             # <<<<<<<<<<<<<<
- *         """
- *         Returns
- */
-
-/* Python wrapper */
-static PyObject *__pyx_pw_11bfs_wrapper_5Graph_1__init__(PyObject *__pyx_self, PyObject *__pyx_v_self); /*proto*/
-static char __pyx_doc_11bfs_wrapper_5Graph___init__[] = "\n        Returns\n        -------\n        Graph : cuGraph.Graph.\n\n        Examples\n        --------\n        >>> import cuGraph\n        >>> G = cuGraph.Graph()\n        ";
-static PyMethodDef __pyx_mdef_11bfs_wrapper_5Graph_1__init__ = {"__init__", (PyCFunction)__pyx_pw_11bfs_wrapper_5Graph_1__init__, METH_O, __pyx_doc_11bfs_wrapper_5Graph___init__};
-static PyObject *__pyx_pw_11bfs_wrapper_5Graph_1__init__(PyObject *__pyx_self, PyObject *__pyx_v_self) {
-  PyObject *__pyx_r = 0;
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("__init__ (wrapper)", 0);
-  __pyx_r = __pyx_pf_11bfs_wrapper_5Graph___init__(__pyx_self, ((PyObject *)__pyx_v_self));
-
-  /* function exit code */
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-static PyObject *__pyx_pf_11bfs_wrapper_5Graph___init__(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self) {
-  struct gdf_graph *__pyx_v_graph;
-  uintptr_t __pyx_v_graph_ptr;
-  PyObject *__pyx_r = NULL;
-  __Pyx_RefNannyDeclarations
-  PyObject *__pyx_t_1 = NULL;
-  __Pyx_RefNannySetupContext("__init__", 0);
-
-  /* "bfs_wrapper.pyx":60
- *         """
- *         cdef gdf_graph* graph
- *         graph = <gdf_graph*>calloc(1,sizeof(gdf_graph))             # <<<<<<<<<<<<<<
- * 
- *         cdef uintptr_t graph_ptr = <uintptr_t>graph
- */
-  __pyx_v_graph = ((struct gdf_graph *)calloc(1, (sizeof(struct gdf_graph))));
-
-  /* "bfs_wrapper.pyx":62
- *         graph = <gdf_graph*>calloc(1,sizeof(gdf_graph))
- * 
- *         cdef uintptr_t graph_ptr = <uintptr_t>graph             # <<<<<<<<<<<<<<
- *         self.graph_ptr = graph_ptr
- * 
- */
-  __pyx_v_graph_ptr = ((uintptr_t)__pyx_v_graph);
-
-  /* "bfs_wrapper.pyx":63
- * 
- *         cdef uintptr_t graph_ptr = <uintptr_t>graph
- *         self.graph_ptr = graph_ptr             # <<<<<<<<<<<<<<
- * 
- * 
- */
-  __pyx_t_1 = __Pyx_PyInt_FromSize_t(__pyx_v_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 63, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (__Pyx_PyObject_SetAttrStr(__pyx_v_self, __pyx_n_s_graph_ptr, __pyx_t_1) < 0) __PYX_ERR(0, 63, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":48
- *         cuGraph graph class containing basic graph creation and transformation operations.
- *     """
- *     def __init__(self):             # <<<<<<<<<<<<<<
- *         """
- *         Returns
- */
-
-  /* function exit code */
-  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
-  goto __pyx_L0;
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_AddTraceback("bfs_wrapper.Graph.__init__", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __pyx_r = NULL;
-  __pyx_L0:;
-  __Pyx_XGIVEREF(__pyx_r);
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-/* "bfs_wrapper.pyx":66
- * 
- * 
- *     def add_edge_list(self, source_col, dest_col, value_col=None):             # <<<<<<<<<<<<<<
- *         """
- *         Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory.
- */
-
-/* Python wrapper */
-static PyObject *__pyx_pw_11bfs_wrapper_5Graph_3add_edge_list(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
-static char __pyx_doc_11bfs_wrapper_5Graph_2add_edge_list[] = "\n        Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory. \n        The cuGraph graph should not already contain the connectivity information as an edge list.\n        If successful, the cuGraph graph descriptor contains the newly added edge list (edge_data is optional).\n\n        Parameters\n        ----------\n        source_indices : gdf_column       \n            This gdf_column of size E (number of edges) contains the index of the source for each edge.\n            Indices must be in the range [0, V-1]. \n        destination_indices   : gdf_column\n            This gdf_column of size E (number of edges) contains the index of the destination for each edge. \n            Indices must be in the range [0, V-1].\n        edge_data (optional)  : gdf_column\n            This pointer can be ``none``. If not, this gdf_column of size E (number of edges) contains the weiht for each edge. \n            The type expected to be floating point.\n\n        Examples\n        --------\n        >>> import cuGraph\n        >>> import cudf\n        >>> from scipy.io import mmread\n        >>> M = ReadMtxFile(graph_file)\n        >>> sources = cudf.Series(M.row)\n        >>> destinations = cudf.Series(M.col)\n        >>> G = cuGraph.Graph()\n        >>> G.add_edge_list(sources,destinations,none)\n        \n        ";
-static PyMethodDef __pyx_mdef_11bfs_wrapper_5Graph_3add_edge_list = {"add_edge_list", (PyCFunction)__pyx_pw_11bfs_wrapper_5Graph_3add_edge_list, METH_VARARGS|METH_KEYWORDS, __pyx_doc_11bfs_wrapper_5Graph_2add_edge_list};
-static PyObject *__pyx_pw_11bfs_wrapper_5Graph_3add_edge_list(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
-  PyObject *__pyx_v_self = 0;
-  PyObject *__pyx_v_source_col = 0;
-  PyObject *__pyx_v_dest_col = 0;
-  PyObject *__pyx_v_value_col = 0;
-  PyObject *__pyx_r = 0;
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("add_edge_list (wrapper)", 0);
-  {
-    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_self,&__pyx_n_s_source_col,&__pyx_n_s_dest_col,&__pyx_n_s_value_col,0};
-    PyObject* values[4] = {0,0,0,0};
-    values[3] = ((PyObject *)((PyObject *)Py_None));
-    if (unlikely(__pyx_kwds)) {
-      Py_ssize_t kw_args;
-      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
-      switch (pos_args) {
-        case  4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
-        CYTHON_FALLTHROUGH;
-        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
-        CYTHON_FALLTHROUGH;
-        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
-        CYTHON_FALLTHROUGH;
-        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
-        CYTHON_FALLTHROUGH;
-        case  0: break;
-        default: goto __pyx_L5_argtuple_error;
-      }
-      kw_args = PyDict_Size(__pyx_kwds);
-      switch (pos_args) {
-        case  0:
-        if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_self)) != 0)) kw_args--;
-        else goto __pyx_L5_argtuple_error;
-        CYTHON_FALLTHROUGH;
-        case  1:
-        if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_source_col)) != 0)) kw_args--;
-        else {
-          __Pyx_RaiseArgtupleInvalid("add_edge_list", 0, 3, 4, 1); __PYX_ERR(0, 66, __pyx_L3_error)
-        }
-        CYTHON_FALLTHROUGH;
-        case  2:
-        if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_dest_col)) != 0)) kw_args--;
-        else {
-          __Pyx_RaiseArgtupleInvalid("add_edge_list", 0, 3, 4, 2); __PYX_ERR(0, 66, __pyx_L3_error)
-        }
-        CYTHON_FALLTHROUGH;
-        case  3:
-        if (kw_args > 0) {
-          PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_value_col);
-          if (value) { values[3] = value; kw_args--; }
-        }
-      }
-      if (unlikely(kw_args > 0)) {
-        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "add_edge_list") < 0)) __PYX_ERR(0, 66, __pyx_L3_error)
-      }
-    } else {
-      switch (PyTuple_GET_SIZE(__pyx_args)) {
-        case  4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
-        CYTHON_FALLTHROUGH;
-        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
-        values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
-        values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
-        break;
-        default: goto __pyx_L5_argtuple_error;
-      }
-    }
-    __pyx_v_self = values[0];
-    __pyx_v_source_col = values[1];
-    __pyx_v_dest_col = values[2];
-    __pyx_v_value_col = values[3];
-  }
-  goto __pyx_L4_argument_unpacking_done;
-  __pyx_L5_argtuple_error:;
-  __Pyx_RaiseArgtupleInvalid("add_edge_list", 0, 3, 4, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 66, __pyx_L3_error)
-  __pyx_L3_error:;
-  __Pyx_AddTraceback("bfs_wrapper.Graph.add_edge_list", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __Pyx_RefNannyFinishContext();
-  return NULL;
-  __pyx_L4_argument_unpacking_done:;
-  __pyx_r = __pyx_pf_11bfs_wrapper_5Graph_2add_edge_list(__pyx_self, __pyx_v_self, __pyx_v_source_col, __pyx_v_dest_col, __pyx_v_value_col);
-
-  /* function exit code */
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-static PyObject *__pyx_pf_11bfs_wrapper_5Graph_2add_edge_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self, PyObject *__pyx_v_source_col, PyObject *__pyx_v_dest_col, PyObject *__pyx_v_value_col) {
-  uintptr_t __pyx_v_graph;
-  uintptr_t __pyx_v_source;
-  uintptr_t __pyx_v_dest;
-  uintptr_t __pyx_v_value;
-  PyObject *__pyx_r = NULL;
-  __Pyx_RefNannyDeclarations
-  PyObject *__pyx_t_1 = NULL;
-  uintptr_t __pyx_t_2;
-  int __pyx_t_3;
-  int __pyx_t_4;
-  __Pyx_RefNannySetupContext("add_edge_list", 0);
-
-  /* "bfs_wrapper.pyx":97
- *         """
- * 
- *         cdef uintptr_t graph = self.graph_ptr             # <<<<<<<<<<<<<<
- *         cdef uintptr_t source=create_column(source_col)
- *         cdef uintptr_t dest=create_column(dest_col)
- */
-  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_self, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 97, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 97, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_graph = __pyx_t_2;
-
-  /* "bfs_wrapper.pyx":98
- * 
- *         cdef uintptr_t graph = self.graph_ptr
- *         cdef uintptr_t source=create_column(source_col)             # <<<<<<<<<<<<<<
- *         cdef uintptr_t dest=create_column(dest_col)
- *         cdef uintptr_t value
- */
-  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_source_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 98, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 98, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_source = __pyx_t_2;
-
-  /* "bfs_wrapper.pyx":99
- *         cdef uintptr_t graph = self.graph_ptr
- *         cdef uintptr_t source=create_column(source_col)
- *         cdef uintptr_t dest=create_column(dest_col)             # <<<<<<<<<<<<<<
- *         cdef uintptr_t value
- *         if value_col is None:
- */
-  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_dest_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 99, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 99, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_dest = __pyx_t_2;
-
-  /* "bfs_wrapper.pyx":101
- *         cdef uintptr_t dest=create_column(dest_col)
- *         cdef uintptr_t value
- *         if value_col is None:             # <<<<<<<<<<<<<<
- *             value = 0
- *         else:
- */
-  __pyx_t_3 = (__pyx_v_value_col == Py_None);
-  __pyx_t_4 = (__pyx_t_3 != 0);
-  if (__pyx_t_4) {
-
-    /* "bfs_wrapper.pyx":102
- *         cdef uintptr_t value
- *         if value_col is None:
- *             value = 0             # <<<<<<<<<<<<<<
- *         else:
- *             value=create_column(value_col)
- */
-    __pyx_v_value = 0;
-
-    /* "bfs_wrapper.pyx":101
- *         cdef uintptr_t dest=create_column(dest_col)
- *         cdef uintptr_t value
- *         if value_col is None:             # <<<<<<<<<<<<<<
- *             value = 0
- *         else:
- */
-    goto __pyx_L3;
-  }
-
-  /* "bfs_wrapper.pyx":104
- *             value = 0
- *         else:
- *             value=create_column(value_col)             # <<<<<<<<<<<<<<
- * 
- *         gdf_edge_list_view(<gdf_graph*>graph,
- */
-  /*else*/ {
-    __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_value_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 104, __pyx_L1_error)
-    __Pyx_GOTREF(__pyx_t_1);
-    __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 104, __pyx_L1_error)
-    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-    __pyx_v_value = __pyx_t_2;
-  }
-  __pyx_L3:;
-
-  /* "bfs_wrapper.pyx":106
- *             value=create_column(value_col)
- * 
- *         gdf_edge_list_view(<gdf_graph*>graph,             # <<<<<<<<<<<<<<
- *                        <gdf_column*>source,
- *                        <gdf_column*>dest,
- */
-  (void)(gdf_edge_list_view(((struct gdf_graph *)__pyx_v_graph), ((gdf_column *)__pyx_v_source), ((gdf_column *)__pyx_v_dest), ((gdf_column *)__pyx_v_value)));
-
-  /* "bfs_wrapper.pyx":66
- * 
- * 
- *     def add_edge_list(self, source_col, dest_col, value_col=None):             # <<<<<<<<<<<<<<
- *         """
- *         Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory.
- */
-
-  /* function exit code */
-  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
-  goto __pyx_L0;
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_AddTraceback("bfs_wrapper.Graph.add_edge_list", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __pyx_r = NULL;
-  __pyx_L0:;
-  __Pyx_XGIVEREF(__pyx_r);
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-/* "bfs_wrapper.pyx":111
- *                        <gdf_column*>value)
- * 
- *     def view_edge_list(self):             # <<<<<<<<<<<<<<
- *         ##TO DO
- *         """
- */
-
-/* Python wrapper */
-static PyObject *__pyx_pw_11bfs_wrapper_5Graph_5view_edge_list(PyObject *__pyx_self, PyObject *__pyx_v_self); /*proto*/
-static char __pyx_doc_11bfs_wrapper_5Graph_4view_edge_list[] = "\n        Display the edge list.\n        ";
-static PyMethodDef __pyx_mdef_11bfs_wrapper_5Graph_5view_edge_list = {"view_edge_list", (PyCFunction)__pyx_pw_11bfs_wrapper_5Graph_5view_edge_list, METH_O, __pyx_doc_11bfs_wrapper_5Graph_4view_edge_list};
-static PyObject *__pyx_pw_11bfs_wrapper_5Graph_5view_edge_list(PyObject *__pyx_self, PyObject *__pyx_v_self) {
-  PyObject *__pyx_r = 0;
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("view_edge_list (wrapper)", 0);
-  __pyx_r = __pyx_pf_11bfs_wrapper_5Graph_4view_edge_list(__pyx_self, ((PyObject *)__pyx_v_self));
-
-  /* function exit code */
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-static PyObject *__pyx_pf_11bfs_wrapper_5Graph_4view_edge_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self) {
-  uintptr_t __pyx_v_graph;
-  struct gdf_graph *__pyx_v_g;
-  gdf_size_type __pyx_v_size;
-  PyObject *__pyx_v_cffi_view = 0;
-  CYTHON_UNUSED PyObject *__pyx_v_data = NULL;
-  PyObject *__pyx_r = NULL;
-  __Pyx_RefNannyDeclarations
-  PyObject *__pyx_t_1 = NULL;
-  uintptr_t __pyx_t_2;
-  gdf_size_type __pyx_t_3;
-  PyObject *__pyx_t_4 = NULL;
-  PyObject *__pyx_t_5 = NULL;
-  PyObject *__pyx_t_6 = NULL;
-  __Pyx_RefNannySetupContext("view_edge_list", 0);
-
-  /* "bfs_wrapper.pyx":116
- *         Display the edge list.
- *         """
- *         cdef uintptr_t graph = self.graph_ptr             # <<<<<<<<<<<<<<
- *         cdef gdf_graph* g = <gdf_graph*>graph
- *         size = g.edgeList.src_indices.size
- */
-  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_self, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 116, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 116, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_graph = __pyx_t_2;
-
-  /* "bfs_wrapper.pyx":117
- *         """
- *         cdef uintptr_t graph = self.graph_ptr
- *         cdef gdf_graph* g = <gdf_graph*>graph             # <<<<<<<<<<<<<<
- *         size = g.edgeList.src_indices.size
- *         print(size)
- */
-  __pyx_v_g = ((struct gdf_graph *)__pyx_v_graph);
-
-  /* "bfs_wrapper.pyx":118
- *         cdef uintptr_t graph = self.graph_ptr
- *         cdef gdf_graph* g = <gdf_graph*>graph
- *         size = g.edgeList.src_indices.size             # <<<<<<<<<<<<<<
- *         print(size)
- *         cdef object cffi_view = <object>g.edgeList.src_indices
- */
-  __pyx_t_3 = __pyx_v_g->edgeList->src_indices->size;
-  __pyx_v_size = __pyx_t_3;
-
-  /* "bfs_wrapper.pyx":119
- *         cdef gdf_graph* g = <gdf_graph*>graph
- *         size = g.edgeList.src_indices.size
- *         print(size)             # <<<<<<<<<<<<<<
- *         cdef object cffi_view = <object>g.edgeList.src_indices
- *         data = cudf._gdf.cffi_view_to_column_mem(cffi_view)
- */
-  __pyx_t_1 = __Pyx_PyInt_FromSize_t(__pyx_v_size); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 119, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (__Pyx_PrintOne(0, __pyx_t_1) < 0) __PYX_ERR(0, 119, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":120
- *         size = g.edgeList.src_indices.size
- *         print(size)
- *         cdef object cffi_view = <object>g.edgeList.src_indices             # <<<<<<<<<<<<<<
- *         data = cudf._gdf.cffi_view_to_column_mem(cffi_view)
- *         #return pygdf.Series(data)
- */
-  __pyx_t_1 = ((PyObject *)__pyx_v_g->edgeList->src_indices);
-  __Pyx_INCREF(__pyx_t_1);
-  __pyx_v_cffi_view = __pyx_t_1;
-  __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":121
- *         print(size)
- *         cdef object cffi_view = <object>g.edgeList.src_indices
- *         data = cudf._gdf.cffi_view_to_column_mem(cffi_view)             # <<<<<<<<<<<<<<
- *         #return pygdf.Series(data)
- *         return 0
- */
-  __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_cudf); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 121, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_gdf); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 121, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_5);
-  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_cffi_view_to_column_mem); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 121, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
-  __pyx_t_5 = NULL;
-  if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_4))) {
-    __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_4);
-    if (likely(__pyx_t_5)) {
-      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4);
-      __Pyx_INCREF(__pyx_t_5);
-      __Pyx_INCREF(function);
-      __Pyx_DECREF_SET(__pyx_t_4, function);
-    }
-  }
-  if (!__pyx_t_5) {
-    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_v_cffi_view); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 121, __pyx_L1_error)
-    __Pyx_GOTREF(__pyx_t_1);
-  } else {
-    #if CYTHON_FAST_PYCALL
-    if (PyFunction_Check(__pyx_t_4)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_v_cffi_view};
-      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 121, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-    } else
-    #endif
-    #if CYTHON_FAST_PYCCALL
-    if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_5, __pyx_v_cffi_view};
-      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 121, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-    } else
-    #endif
-    {
-      __pyx_t_6 = PyTuple_New(1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 121, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_6);
-      __Pyx_GIVEREF(__pyx_t_5); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_5); __pyx_t_5 = NULL;
-      __Pyx_INCREF(__pyx_v_cffi_view);
-      __Pyx_GIVEREF(__pyx_v_cffi_view);
-      PyTuple_SET_ITEM(__pyx_t_6, 0+1, __pyx_v_cffi_view);
-      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 121, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
-    }
-  }
-  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-  __pyx_v_data = __pyx_t_1;
-  __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":123
- *         data = cudf._gdf.cffi_view_to_column_mem(cffi_view)
- *         #return pygdf.Series(data)
- *         return 0             # <<<<<<<<<<<<<<
- * 
- *     def add_adj_list(self, offsets_col, indices_col, value_col):
- */
-  __Pyx_XDECREF(__pyx_r);
-  __Pyx_INCREF(__pyx_int_0);
-  __pyx_r = __pyx_int_0;
-  goto __pyx_L0;
-
-  /* "bfs_wrapper.pyx":111
- *                        <gdf_column*>value)
- * 
- *     def view_edge_list(self):             # <<<<<<<<<<<<<<
- *         ##TO DO
- *         """
- */
-
-  /* function exit code */
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_XDECREF(__pyx_t_4);
-  __Pyx_XDECREF(__pyx_t_5);
-  __Pyx_XDECREF(__pyx_t_6);
-  __Pyx_AddTraceback("bfs_wrapper.Graph.view_edge_list", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __pyx_r = NULL;
-  __pyx_L0:;
-  __Pyx_XDECREF(__pyx_v_cffi_view);
-  __Pyx_XDECREF(__pyx_v_data);
-  __Pyx_XGIVEREF(__pyx_r);
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-/* "bfs_wrapper.pyx":125
- *         return 0
- * 
- *     def add_adj_list(self, offsets_col, indices_col, value_col):             # <<<<<<<<<<<<<<
- *         """
- *         Warp existing gdf columns representing an adjacency list in a gdf_graph.
- */
-
-/* Python wrapper */
-static PyObject *__pyx_pw_11bfs_wrapper_5Graph_7add_adj_list(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
-static char __pyx_doc_11bfs_wrapper_5Graph_6add_adj_list[] = "\n        Warp existing gdf columns representing an adjacency list in a gdf_graph.\n        ";
-static PyMethodDef __pyx_mdef_11bfs_wrapper_5Graph_7add_adj_list = {"add_adj_list", (PyCFunction)__pyx_pw_11bfs_wrapper_5Graph_7add_adj_list, METH_VARARGS|METH_KEYWORDS, __pyx_doc_11bfs_wrapper_5Graph_6add_adj_list};
-static PyObject *__pyx_pw_11bfs_wrapper_5Graph_7add_adj_list(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
-  PyObject *__pyx_v_self = 0;
-  PyObject *__pyx_v_offsets_col = 0;
-  PyObject *__pyx_v_indices_col = 0;
-  PyObject *__pyx_v_value_col = 0;
-  PyObject *__pyx_r = 0;
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("add_adj_list (wrapper)", 0);
-  {
-    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_self,&__pyx_n_s_offsets_col,&__pyx_n_s_indices_col,&__pyx_n_s_value_col,0};
-    PyObject* values[4] = {0,0,0,0};
-    if (unlikely(__pyx_kwds)) {
-      Py_ssize_t kw_args;
-      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
-      switch (pos_args) {
-        case  4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
-        CYTHON_FALLTHROUGH;
-        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
-        CYTHON_FALLTHROUGH;
-        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
-        CYTHON_FALLTHROUGH;
-        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
-        CYTHON_FALLTHROUGH;
-        case  0: break;
-        default: goto __pyx_L5_argtuple_error;
-      }
-      kw_args = PyDict_Size(__pyx_kwds);
-      switch (pos_args) {
-        case  0:
-        if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_self)) != 0)) kw_args--;
-        else goto __pyx_L5_argtuple_error;
-        CYTHON_FALLTHROUGH;
-        case  1:
-        if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_offsets_col)) != 0)) kw_args--;
-        else {
-          __Pyx_RaiseArgtupleInvalid("add_adj_list", 1, 4, 4, 1); __PYX_ERR(0, 125, __pyx_L3_error)
-        }
-        CYTHON_FALLTHROUGH;
-        case  2:
-        if (likely((values[2] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_indices_col)) != 0)) kw_args--;
-        else {
-          __Pyx_RaiseArgtupleInvalid("add_adj_list", 1, 4, 4, 2); __PYX_ERR(0, 125, __pyx_L3_error)
-        }
-        CYTHON_FALLTHROUGH;
-        case  3:
-        if (likely((values[3] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_value_col)) != 0)) kw_args--;
-        else {
-          __Pyx_RaiseArgtupleInvalid("add_adj_list", 1, 4, 4, 3); __PYX_ERR(0, 125, __pyx_L3_error)
-        }
-      }
-      if (unlikely(kw_args > 0)) {
-        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "add_adj_list") < 0)) __PYX_ERR(0, 125, __pyx_L3_error)
-      }
-    } else if (PyTuple_GET_SIZE(__pyx_args) != 4) {
-      goto __pyx_L5_argtuple_error;
-    } else {
-      values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
-      values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
-      values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
-      values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
-    }
-    __pyx_v_self = values[0];
-    __pyx_v_offsets_col = values[1];
-    __pyx_v_indices_col = values[2];
-    __pyx_v_value_col = values[3];
-  }
-  goto __pyx_L4_argument_unpacking_done;
-  __pyx_L5_argtuple_error:;
-  __Pyx_RaiseArgtupleInvalid("add_adj_list", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 125, __pyx_L3_error)
-  __pyx_L3_error:;
-  __Pyx_AddTraceback("bfs_wrapper.Graph.add_adj_list", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __Pyx_RefNannyFinishContext();
-  return NULL;
-  __pyx_L4_argument_unpacking_done:;
-  __pyx_r = __pyx_pf_11bfs_wrapper_5Graph_6add_adj_list(__pyx_self, __pyx_v_self, __pyx_v_offsets_col, __pyx_v_indices_col, __pyx_v_value_col);
-
-  /* function exit code */
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-static PyObject *__pyx_pf_11bfs_wrapper_5Graph_6add_adj_list(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self, PyObject *__pyx_v_offsets_col, PyObject *__pyx_v_indices_col, PyObject *__pyx_v_value_col) {
-  uintptr_t __pyx_v_graph;
-  uintptr_t __pyx_v_offsets;
-  uintptr_t __pyx_v_indices;
-  uintptr_t __pyx_v_value;
-  PyObject *__pyx_r = NULL;
-  __Pyx_RefNannyDeclarations
-  PyObject *__pyx_t_1 = NULL;
-  uintptr_t __pyx_t_2;
-  int __pyx_t_3;
-  int __pyx_t_4;
-  __Pyx_RefNannySetupContext("add_adj_list", 0);
-
-  /* "bfs_wrapper.pyx":130
- *         """
- *         ##TO TEST
- *         cdef uintptr_t graph = self.graph_ptr             # <<<<<<<<<<<<<<
- *         cdef uintptr_t offsets=create_column(offsets_col)
- *         cdef uintptr_t indices=create_column(indices_col)
- */
-  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_self, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 130, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 130, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_graph = __pyx_t_2;
-
-  /* "bfs_wrapper.pyx":131
- *         ##TO TEST
- *         cdef uintptr_t graph = self.graph_ptr
- *         cdef uintptr_t offsets=create_column(offsets_col)             # <<<<<<<<<<<<<<
- *         cdef uintptr_t indices=create_column(indices_col)
- *         cdef uintptr_t value
- */
-  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_offsets_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 131, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 131, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_offsets = __pyx_t_2;
-
-  /* "bfs_wrapper.pyx":132
- *         cdef uintptr_t graph = self.graph_ptr
- *         cdef uintptr_t offsets=create_column(offsets_col)
- *         cdef uintptr_t indices=create_column(indices_col)             # <<<<<<<<<<<<<<
- *         cdef uintptr_t value
- *         if value_col is None:
- */
-  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_indices_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 132, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 132, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_indices = __pyx_t_2;
-
-  /* "bfs_wrapper.pyx":134
- *         cdef uintptr_t indices=create_column(indices_col)
- *         cdef uintptr_t value
- *         if value_col is None:             # <<<<<<<<<<<<<<
- *             value = 0
- *         else:
- */
-  __pyx_t_3 = (__pyx_v_value_col == Py_None);
-  __pyx_t_4 = (__pyx_t_3 != 0);
-  if (__pyx_t_4) {
-
-    /* "bfs_wrapper.pyx":135
- *         cdef uintptr_t value
- *         if value_col is None:
- *             value = 0             # <<<<<<<<<<<<<<
- *         else:
- *             value=create_column(value_col)
- */
-    __pyx_v_value = 0;
-
-    /* "bfs_wrapper.pyx":134
- *         cdef uintptr_t indices=create_column(indices_col)
- *         cdef uintptr_t value
- *         if value_col is None:             # <<<<<<<<<<<<<<
- *             value = 0
- *         else:
- */
-    goto __pyx_L3;
-  }
-
-  /* "bfs_wrapper.pyx":137
- *             value = 0
- *         else:
- *             value=create_column(value_col)             # <<<<<<<<<<<<<<
- * 
- *         gdf_adj_list_view(<gdf_graph*>graph,
- */
-  /*else*/ {
-    __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_value_col); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 137, __pyx_L1_error)
-    __Pyx_GOTREF(__pyx_t_1);
-    __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 137, __pyx_L1_error)
-    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-    __pyx_v_value = __pyx_t_2;
-  }
-  __pyx_L3:;
-
-  /* "bfs_wrapper.pyx":139
- *             value=create_column(value_col)
- * 
- *         gdf_adj_list_view(<gdf_graph*>graph,             # <<<<<<<<<<<<<<
- *                        <gdf_column*>offsets,
- *                        <gdf_column*>indices,
- */
-  (void)(gdf_adj_list_view(((struct gdf_graph *)__pyx_v_graph), ((gdf_column *)__pyx_v_offsets), ((gdf_column *)__pyx_v_indices), ((gdf_column *)__pyx_v_value)));
-
-  /* "bfs_wrapper.pyx":125
- *         return 0
- * 
- *     def add_adj_list(self, offsets_col, indices_col, value_col):             # <<<<<<<<<<<<<<
- *         """
- *         Warp existing gdf columns representing an adjacency list in a gdf_graph.
- */
-
-  /* function exit code */
-  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
-  goto __pyx_L0;
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_AddTraceback("bfs_wrapper.Graph.add_adj_list", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __pyx_r = NULL;
-  __pyx_L0:;
-  __Pyx_XGIVEREF(__pyx_r);
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-/* "bfs_wrapper.pyx":145
- * 
- * 
- *     def add_transpose(self):             # <<<<<<<<<<<<<<
- *         """
- *         Compute the transposed adjacency list from the edge list and add it to the existing graph.
- */
-
-/* Python wrapper */
-static PyObject *__pyx_pw_11bfs_wrapper_5Graph_9add_transpose(PyObject *__pyx_self, PyObject *__pyx_v_self); /*proto*/
-static char __pyx_doc_11bfs_wrapper_5Graph_8add_transpose[] = "\n        Compute the transposed adjacency list from the edge list and add it to the existing graph.\n        ";
-static PyMethodDef __pyx_mdef_11bfs_wrapper_5Graph_9add_transpose = {"add_transpose", (PyCFunction)__pyx_pw_11bfs_wrapper_5Graph_9add_transpose, METH_O, __pyx_doc_11bfs_wrapper_5Graph_8add_transpose};
-static PyObject *__pyx_pw_11bfs_wrapper_5Graph_9add_transpose(PyObject *__pyx_self, PyObject *__pyx_v_self) {
-  PyObject *__pyx_r = 0;
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("add_transpose (wrapper)", 0);
-  __pyx_r = __pyx_pf_11bfs_wrapper_5Graph_8add_transpose(__pyx_self, ((PyObject *)__pyx_v_self));
-
-  /* function exit code */
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-static PyObject *__pyx_pf_11bfs_wrapper_5Graph_8add_transpose(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_self) {
-  uintptr_t __pyx_v_graph;
-  PyObject *__pyx_r = NULL;
-  __Pyx_RefNannyDeclarations
-  PyObject *__pyx_t_1 = NULL;
-  uintptr_t __pyx_t_2;
-  __Pyx_RefNannySetupContext("add_transpose", 0);
-
-  /* "bfs_wrapper.pyx":149
- *         Compute the transposed adjacency list from the edge list and add it to the existing graph.
- *         """
- *         cdef uintptr_t graph = self.graph_ptr             # <<<<<<<<<<<<<<
- *         gdf_add_transpose(<gdf_graph*>graph)
- * 
- */
-  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_self, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 149, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 149, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_graph = __pyx_t_2;
-
-  /* "bfs_wrapper.pyx":150
- *         """
- *         cdef uintptr_t graph = self.graph_ptr
- *         gdf_add_transpose(<gdf_graph*>graph)             # <<<<<<<<<<<<<<
- * 
- * cpdef bfs(G, start, directed=True):
- */
-  (void)(gdf_add_transpose(((struct gdf_graph *)__pyx_v_graph)));
-
-  /* "bfs_wrapper.pyx":145
- * 
- * 
- *     def add_transpose(self):             # <<<<<<<<<<<<<<
- *         """
- *         Compute the transposed adjacency list from the edge list and add it to the existing graph.
- */
-
-  /* function exit code */
-  __pyx_r = Py_None; __Pyx_INCREF(Py_None);
-  goto __pyx_L0;
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_AddTraceback("bfs_wrapper.Graph.add_transpose", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __pyx_r = NULL;
-  __pyx_L0:;
-  __Pyx_XGIVEREF(__pyx_r);
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-/* "bfs_wrapper.pyx":152
- *         gdf_add_transpose(<gdf_graph*>graph)
- * 
- * cpdef bfs(G, start, directed=True):             # <<<<<<<<<<<<<<
- *     """
- *     Find the distances and predecessors for a breadth first traversal of a graph.
- */
-
-static PyObject *__pyx_pw_11bfs_wrapper_7bfs(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
-static PyObject *__pyx_f_11bfs_wrapper_bfs(PyObject *__pyx_v_G, PyObject *__pyx_v_start, CYTHON_UNUSED int __pyx_skip_dispatch, struct __pyx_opt_args_11bfs_wrapper_bfs *__pyx_optional_args) {
-  PyObject *__pyx_v_directed = ((PyObject *)Py_True);
-  uintptr_t __pyx_v_graph;
-  struct gdf_graph *__pyx_v_g;
-  gdf_size_type __pyx_v_num_verts;
-  PyObject *__pyx_v_distances = NULL;
-  uintptr_t __pyx_v_distances_ptr;
-  PyObject *__pyx_v_predecessors = NULL;
-  uintptr_t __pyx_v_predecessors_ptr;
-  PyObject *__pyx_r = NULL;
-  __Pyx_RefNannyDeclarations
-  PyObject *__pyx_t_1 = NULL;
-  uintptr_t __pyx_t_2;
-  PyObject *__pyx_t_3 = NULL;
-  PyObject *__pyx_t_4 = NULL;
-  PyObject *__pyx_t_5 = NULL;
-  PyObject *__pyx_t_6 = NULL;
-  PyObject *__pyx_t_7 = NULL;
-  PyObject *__pyx_t_8 = NULL;
-  int __pyx_t_9;
-  bool __pyx_t_10;
-  __Pyx_RefNannySetupContext("bfs", 0);
-  if (__pyx_optional_args) {
-    if (__pyx_optional_args->__pyx_n > 0) {
-      __pyx_v_directed = __pyx_optional_args->directed;
-    }
-  }
-
-  /* "bfs_wrapper.pyx":184
- *     """
- * 
- *     cdef uintptr_t graph = G.graph_ptr             # <<<<<<<<<<<<<<
- *     cdef gdf_graph* g = <gdf_graph*>graph
- *     num_verts = g.adjList.offsets.size - 1
- */
-  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_G, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 184, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 184, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_graph = __pyx_t_2;
-
-  /* "bfs_wrapper.pyx":185
- * 
- *     cdef uintptr_t graph = G.graph_ptr
- *     cdef gdf_graph* g = <gdf_graph*>graph             # <<<<<<<<<<<<<<
- *     num_verts = g.adjList.offsets.size - 1
- *     distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))
- */
-  __pyx_v_g = ((struct gdf_graph *)__pyx_v_graph);
-
-  /* "bfs_wrapper.pyx":186
- *     cdef uintptr_t graph = G.graph_ptr
- *     cdef gdf_graph* g = <gdf_graph*>graph
- *     num_verts = g.adjList.offsets.size - 1             # <<<<<<<<<<<<<<
- *     distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))
- *     cdef uintptr_t distances_ptr = create_column(distances)
- */
-  __pyx_v_num_verts = (__pyx_v_g->adjList->offsets->size - 1);
-
-  /* "bfs_wrapper.pyx":187
- *     cdef gdf_graph* g = <gdf_graph*>graph
- *     num_verts = g.adjList.offsets.size - 1
- *     distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))             # <<<<<<<<<<<<<<
- *     cdef uintptr_t distances_ptr = create_column(distances)
- *     predecessors = cudf.Series(np.zeros(num_verts, dtype=np.int32))
- */
-  __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_cudf); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 187, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_3);
-  __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_Series); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 187, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-  __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 187, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_3);
-  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_3, __pyx_n_s_zeros); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 187, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_5);
-  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-  __pyx_t_3 = __Pyx_PyInt_FromSize_t(__pyx_v_num_verts); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 187, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_3);
-  __pyx_t_6 = PyTuple_New(1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 187, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_6);
-  __Pyx_GIVEREF(__pyx_t_3);
-  PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_3);
-  __pyx_t_3 = 0;
-  __pyx_t_3 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 187, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_3);
-  __pyx_t_7 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 187, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_7);
-  __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_7, __pyx_n_s_int32); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 187, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_8);
-  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
-  if (PyDict_SetItem(__pyx_t_3, __pyx_n_s_dtype, __pyx_t_8) < 0) __PYX_ERR(0, 187, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
-  __pyx_t_8 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_6, __pyx_t_3); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 187, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_8);
-  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
-  __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
-  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-  __pyx_t_3 = NULL;
-  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_4))) {
-    __pyx_t_3 = PyMethod_GET_SELF(__pyx_t_4);
-    if (likely(__pyx_t_3)) {
-      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_4);
-      __Pyx_INCREF(__pyx_t_3);
-      __Pyx_INCREF(function);
-      __Pyx_DECREF_SET(__pyx_t_4, function);
-    }
-  }
-  if (!__pyx_t_3) {
-    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_4, __pyx_t_8); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 187, __pyx_L1_error)
-    __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
-    __Pyx_GOTREF(__pyx_t_1);
-  } else {
-    #if CYTHON_FAST_PYCALL
-    if (PyFunction_Check(__pyx_t_4)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_t_8};
-      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 187, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
-    } else
-    #endif
-    #if CYTHON_FAST_PYCCALL
-    if (__Pyx_PyFastCFunction_Check(__pyx_t_4)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_3, __pyx_t_8};
-      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_4, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 187, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_3); __pyx_t_3 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
-    } else
-    #endif
-    {
-      __pyx_t_6 = PyTuple_New(1+1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 187, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_6);
-      __Pyx_GIVEREF(__pyx_t_3); PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_t_3); __pyx_t_3 = NULL;
-      __Pyx_GIVEREF(__pyx_t_8);
-      PyTuple_SET_ITEM(__pyx_t_6, 0+1, __pyx_t_8);
-      __pyx_t_8 = 0;
-      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_t_6, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 187, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
-    }
-  }
-  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-  __pyx_v_distances = __pyx_t_1;
-  __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":188
- *     num_verts = g.adjList.offsets.size - 1
- *     distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))
- *     cdef uintptr_t distances_ptr = create_column(distances)             # <<<<<<<<<<<<<<
- *     predecessors = cudf.Series(np.zeros(num_verts, dtype=np.int32))
- *     cdef uintptr_t predecessors_ptr = create_column(distances)
- */
-  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_distances); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 188, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 188, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_distances_ptr = __pyx_t_2;
-
-  /* "bfs_wrapper.pyx":189
- *     distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))
- *     cdef uintptr_t distances_ptr = create_column(distances)
- *     predecessors = cudf.Series(np.zeros(num_verts, dtype=np.int32))             # <<<<<<<<<<<<<<
- *     cdef uintptr_t predecessors_ptr = create_column(distances)
- * 
- */
-  __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_cudf); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 189, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_Series); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 189, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_6);
-  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-  __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 189, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_zeros); if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 189, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_8);
-  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-  __pyx_t_4 = __Pyx_PyInt_FromSize_t(__pyx_v_num_verts); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 189, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_3);
-  __Pyx_GIVEREF(__pyx_t_4);
-  PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4);
-  __pyx_t_4 = 0;
-  __pyx_t_4 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 189, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_4);
-  __pyx_t_5 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 189, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_5);
-  __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_t_5, __pyx_n_s_int32); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 189, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_7);
-  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
-  if (PyDict_SetItem(__pyx_t_4, __pyx_n_s_dtype, __pyx_t_7) < 0) __PYX_ERR(0, 189, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
-  __pyx_t_7 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_t_3, __pyx_t_4); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 189, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_7);
-  __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
-  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
-  __pyx_t_4 = NULL;
-  if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_6))) {
-    __pyx_t_4 = PyMethod_GET_SELF(__pyx_t_6);
-    if (likely(__pyx_t_4)) {
-      PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_6);
-      __Pyx_INCREF(__pyx_t_4);
-      __Pyx_INCREF(function);
-      __Pyx_DECREF_SET(__pyx_t_6, function);
-    }
-  }
-  if (!__pyx_t_4) {
-    __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_6, __pyx_t_7); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error)
-    __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
-    __Pyx_GOTREF(__pyx_t_1);
-  } else {
-    #if CYTHON_FAST_PYCALL
-    if (PyFunction_Check(__pyx_t_6)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_7};
-      __pyx_t_1 = __Pyx_PyFunction_FastCall(__pyx_t_6, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
-    } else
-    #endif
-    #if CYTHON_FAST_PYCCALL
-    if (__Pyx_PyFastCFunction_Check(__pyx_t_6)) {
-      PyObject *__pyx_temp[2] = {__pyx_t_4, __pyx_t_7};
-      __pyx_t_1 = __Pyx_PyCFunction_FastCall(__pyx_t_6, __pyx_temp+1-1, 1+1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error)
-      __Pyx_XDECREF(__pyx_t_4); __pyx_t_4 = 0;
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
-    } else
-    #endif
-    {
-      __pyx_t_3 = PyTuple_New(1+1); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 189, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_3);
-      __Pyx_GIVEREF(__pyx_t_4); PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_4); __pyx_t_4 = NULL;
-      __Pyx_GIVEREF(__pyx_t_7);
-      PyTuple_SET_ITEM(__pyx_t_3, 0+1, __pyx_t_7);
-      __pyx_t_7 = 0;
-      __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_6, __pyx_t_3, NULL); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error)
-      __Pyx_GOTREF(__pyx_t_1);
-      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-    }
-  }
-  __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
-  __pyx_v_predecessors = __pyx_t_1;
-  __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":190
- *     cdef uintptr_t distances_ptr = create_column(distances)
- *     predecessors = cudf.Series(np.zeros(num_verts, dtype=np.int32))
- *     cdef uintptr_t predecessors_ptr = create_column(distances)             # <<<<<<<<<<<<<<
- * 
- *     gdf_bfs(<gdf_graph*>g, <gdf_column*>distances_ptr, <gdf_column*>predecessors_ptr, <int>start, <bool>directed)
- */
-  __pyx_t_1 = __pyx_f_11bfs_wrapper_create_column(__pyx_v_distances); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 190, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_2 = __Pyx_PyInt_As_size_t(__pyx_t_1); if (unlikely((__pyx_t_2 == ((uintptr_t)-1)) && PyErr_Occurred())) __PYX_ERR(0, 190, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_v_predecessors_ptr = __pyx_t_2;
-
-  /* "bfs_wrapper.pyx":192
- *     cdef uintptr_t predecessors_ptr = create_column(distances)
- * 
- *     gdf_bfs(<gdf_graph*>g, <gdf_column*>distances_ptr, <gdf_column*>predecessors_ptr, <int>start, <bool>directed)             # <<<<<<<<<<<<<<
- *     return distances, predecessors
- */
-  __pyx_t_9 = __Pyx_PyInt_As_int(__pyx_v_start); if (unlikely((__pyx_t_9 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 192, __pyx_L1_error)
-  __pyx_t_10 = __Pyx_PyObject_IsTrue(__pyx_v_directed); if (unlikely((__pyx_t_10 == ((bool)-1)) && PyErr_Occurred())) __PYX_ERR(0, 192, __pyx_L1_error)
-  (void)(gdf_bfs(((struct gdf_graph *)__pyx_v_g), ((gdf_column *)__pyx_v_distances_ptr), ((gdf_column *)__pyx_v_predecessors_ptr), ((int)__pyx_t_9), ((bool)__pyx_t_10)));
-
-  /* "bfs_wrapper.pyx":193
- * 
- *     gdf_bfs(<gdf_graph*>g, <gdf_column*>distances_ptr, <gdf_column*>predecessors_ptr, <int>start, <bool>directed)
- *     return distances, predecessors             # <<<<<<<<<<<<<<
- */
-  __Pyx_XDECREF(__pyx_r);
-  __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 193, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __Pyx_INCREF(__pyx_v_distances);
-  __Pyx_GIVEREF(__pyx_v_distances);
-  PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_distances);
-  __Pyx_INCREF(__pyx_v_predecessors);
-  __Pyx_GIVEREF(__pyx_v_predecessors);
-  PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_v_predecessors);
-  __pyx_r = __pyx_t_1;
-  __pyx_t_1 = 0;
-  goto __pyx_L0;
-
-  /* "bfs_wrapper.pyx":152
- *         gdf_add_transpose(<gdf_graph*>graph)
- * 
- * cpdef bfs(G, start, directed=True):             # <<<<<<<<<<<<<<
- *     """
- *     Find the distances and predecessors for a breadth first traversal of a graph.
- */
-
-  /* function exit code */
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_XDECREF(__pyx_t_3);
-  __Pyx_XDECREF(__pyx_t_4);
-  __Pyx_XDECREF(__pyx_t_5);
-  __Pyx_XDECREF(__pyx_t_6);
-  __Pyx_XDECREF(__pyx_t_7);
-  __Pyx_XDECREF(__pyx_t_8);
-  __Pyx_AddTraceback("bfs_wrapper.bfs", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __pyx_r = 0;
-  __pyx_L0:;
-  __Pyx_XDECREF(__pyx_v_distances);
-  __Pyx_XDECREF(__pyx_v_predecessors);
-  __Pyx_XGIVEREF(__pyx_r);
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-/* Python wrapper */
-static PyObject *__pyx_pw_11bfs_wrapper_7bfs(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
-static char __pyx_doc_11bfs_wrapper_6bfs[] = "\n    Find the distances and predecessors for a breadth first traversal of a graph.\n    \n    Parameters\n    ----------\n    G : cugraph.graph\n        cuGraph graph descriptor, should contain the connectivity information as an\n        adjacency list.\n    start : Integer\n        The index of the graph vertex from which the traversal begins\n    directed : bool\n        Indicates whether the graph in question is a directed graph, or whether\n        each edge has a corresponding reverse edge. (Allows optimizations if the\n        graph is undirected)\n    \n    Returns\n    -------\n    distances, predecessors : cudf.Series\n        distances gives the path distance for each vertex from the starting vertex\n        predecessors gives for each vertex the vertex it was reached from in the traversal\n        \n    Examples\n    --------\n    >>> M = ReadMtxFile(graph_file)\n    >>> sources = cudf.Series(M.row)\n    >>> destinations = cudf.Series(M.col)\n    >>> G = cuGraph.Graph()\n    >>> G.add_edge_list(sources,destinations,none)\n    >>> dist, pred = cuGraph.bfs(G, 0, false)\n    ";
-static PyObject *__pyx_pw_11bfs_wrapper_7bfs(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
-  PyObject *__pyx_v_G = 0;
-  PyObject *__pyx_v_start = 0;
-  PyObject *__pyx_v_directed = 0;
-  PyObject *__pyx_r = 0;
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("bfs (wrapper)", 0);
-  {
-    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_G,&__pyx_n_s_start,&__pyx_n_s_directed,0};
-    PyObject* values[3] = {0,0,0};
-    values[2] = ((PyObject *)Py_True);
-    if (unlikely(__pyx_kwds)) {
-      Py_ssize_t kw_args;
-      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
-      switch (pos_args) {
-        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
-        CYTHON_FALLTHROUGH;
-        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
-        CYTHON_FALLTHROUGH;
-        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
-        CYTHON_FALLTHROUGH;
-        case  0: break;
-        default: goto __pyx_L5_argtuple_error;
-      }
-      kw_args = PyDict_Size(__pyx_kwds);
-      switch (pos_args) {
-        case  0:
-        if (likely((values[0] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_G)) != 0)) kw_args--;
-        else goto __pyx_L5_argtuple_error;
-        CYTHON_FALLTHROUGH;
-        case  1:
-        if (likely((values[1] = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_start)) != 0)) kw_args--;
-        else {
-          __Pyx_RaiseArgtupleInvalid("bfs", 0, 2, 3, 1); __PYX_ERR(0, 152, __pyx_L3_error)
-        }
-        CYTHON_FALLTHROUGH;
-        case  2:
-        if (kw_args > 0) {
-          PyObject* value = __Pyx_PyDict_GetItemStr(__pyx_kwds, __pyx_n_s_directed);
-          if (value) { values[2] = value; kw_args--; }
-        }
-      }
-      if (unlikely(kw_args > 0)) {
-        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "bfs") < 0)) __PYX_ERR(0, 152, __pyx_L3_error)
-      }
-    } else {
-      switch (PyTuple_GET_SIZE(__pyx_args)) {
-        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
-        CYTHON_FALLTHROUGH;
-        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
-        values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
-        break;
-        default: goto __pyx_L5_argtuple_error;
-      }
-    }
-    __pyx_v_G = values[0];
-    __pyx_v_start = values[1];
-    __pyx_v_directed = values[2];
-  }
-  goto __pyx_L4_argument_unpacking_done;
-  __pyx_L5_argtuple_error:;
-  __Pyx_RaiseArgtupleInvalid("bfs", 0, 2, 3, PyTuple_GET_SIZE(__pyx_args)); __PYX_ERR(0, 152, __pyx_L3_error)
-  __pyx_L3_error:;
-  __Pyx_AddTraceback("bfs_wrapper.bfs", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __Pyx_RefNannyFinishContext();
-  return NULL;
-  __pyx_L4_argument_unpacking_done:;
-  __pyx_r = __pyx_pf_11bfs_wrapper_6bfs(__pyx_self, __pyx_v_G, __pyx_v_start, __pyx_v_directed);
-
-  /* function exit code */
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-static PyObject *__pyx_pf_11bfs_wrapper_6bfs(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_G, PyObject *__pyx_v_start, PyObject *__pyx_v_directed) {
-  PyObject *__pyx_r = NULL;
-  __Pyx_RefNannyDeclarations
-  PyObject *__pyx_t_1 = NULL;
-  struct __pyx_opt_args_11bfs_wrapper_bfs __pyx_t_2;
-  __Pyx_RefNannySetupContext("bfs", 0);
-  __Pyx_XDECREF(__pyx_r);
-  __pyx_t_2.__pyx_n = 1;
-  __pyx_t_2.directed = __pyx_v_directed;
-  __pyx_t_1 = __pyx_f_11bfs_wrapper_bfs(__pyx_v_G, __pyx_v_start, 0, &__pyx_t_2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 152, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_r = __pyx_t_1;
-  __pyx_t_1 = 0;
-  goto __pyx_L0;
-
-  /* function exit code */
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_AddTraceback("bfs_wrapper.bfs", __pyx_clineno, __pyx_lineno, __pyx_filename);
-  __pyx_r = NULL;
-  __pyx_L0:;
-  __Pyx_XGIVEREF(__pyx_r);
-  __Pyx_RefNannyFinishContext();
-  return __pyx_r;
-}
-
-static PyMethodDef __pyx_methods[] = {
-  {"bfs", (PyCFunction)__pyx_pw_11bfs_wrapper_7bfs, METH_VARARGS|METH_KEYWORDS, __pyx_doc_11bfs_wrapper_6bfs},
-  {0, 0, 0, 0}
-};
-
-#if PY_MAJOR_VERSION >= 3
-#if CYTHON_PEP489_MULTI_PHASE_INIT
-static PyObject* __pyx_pymod_create(PyObject *spec, PyModuleDef *def); /*proto*/
-static int __pyx_pymod_exec_bfs_wrapper(PyObject* module); /*proto*/
-static PyModuleDef_Slot __pyx_moduledef_slots[] = {
-  {Py_mod_create, (void*)__pyx_pymod_create},
-  {Py_mod_exec, (void*)__pyx_pymod_exec_bfs_wrapper},
-  {0, NULL}
-};
-#endif
-
-static struct PyModuleDef __pyx_moduledef = {
-    PyModuleDef_HEAD_INIT,
-    "bfs_wrapper",
-    0, /* m_doc */
-  #if CYTHON_PEP489_MULTI_PHASE_INIT
-    0, /* m_size */
-  #else
-    -1, /* m_size */
-  #endif
-    __pyx_methods /* m_methods */,
-  #if CYTHON_PEP489_MULTI_PHASE_INIT
-    __pyx_moduledef_slots, /* m_slots */
-  #else
-    NULL, /* m_reload */
-  #endif
-    NULL, /* m_traverse */
-    NULL, /* m_clear */
-    NULL /* m_free */
-};
-#endif
-
-static __Pyx_StringTabEntry __pyx_string_tab[] = {
-  {&__pyx_kp_u_Find_the_distances_and_predeces, __pyx_k_Find_the_distances_and_predeces, sizeof(__pyx_k_Find_the_distances_and_predeces), 0, 1, 0, 0},
-  {&__pyx_n_s_G, __pyx_k_G, sizeof(__pyx_k_G), 0, 0, 1, 1},
-  {&__pyx_n_s_Graph, __pyx_k_Graph, sizeof(__pyx_k_Graph), 0, 0, 1, 1},
-  {&__pyx_n_s_Graph___init, __pyx_k_Graph___init, sizeof(__pyx_k_Graph___init), 0, 0, 1, 1},
-  {&__pyx_kp_u_Graph___init___line_48, __pyx_k_Graph___init___line_48, sizeof(__pyx_k_Graph___init___line_48), 0, 1, 0, 0},
-  {&__pyx_n_s_Graph_add_adj_list, __pyx_k_Graph_add_adj_list, sizeof(__pyx_k_Graph_add_adj_list), 0, 0, 1, 1},
-  {&__pyx_n_s_Graph_add_edge_list, __pyx_k_Graph_add_edge_list, sizeof(__pyx_k_Graph_add_edge_list), 0, 0, 1, 1},
-  {&__pyx_kp_u_Graph_add_edge_list_line_66, __pyx_k_Graph_add_edge_list_line_66, sizeof(__pyx_k_Graph_add_edge_list_line_66), 0, 1, 0, 0},
-  {&__pyx_n_s_Graph_add_transpose, __pyx_k_Graph_add_transpose, sizeof(__pyx_k_Graph_add_transpose), 0, 0, 1, 1},
-  {&__pyx_n_s_Graph_view_edge_list, __pyx_k_Graph_view_edge_list, sizeof(__pyx_k_Graph_view_edge_list), 0, 0, 1, 1},
-  {&__pyx_kp_u_Returns_Graph_cuGraph_Graph_Exa, __pyx_k_Returns_Graph_cuGraph_Graph_Exa, sizeof(__pyx_k_Returns_Graph_cuGraph_Graph_Exa), 0, 1, 0, 0},
-  {&__pyx_n_s_Series, __pyx_k_Series, sizeof(__pyx_k_Series), 0, 0, 1, 1},
-  {&__pyx_kp_u_Warp_existing_gdf_columns_repre, __pyx_k_Warp_existing_gdf_columns_repre, sizeof(__pyx_k_Warp_existing_gdf_columns_repre), 0, 1, 0, 0},
-  {&__pyx_n_s_add_adj_list, __pyx_k_add_adj_list, sizeof(__pyx_k_add_adj_list), 0, 0, 1, 1},
-  {&__pyx_n_s_add_edge_list, __pyx_k_add_edge_list, sizeof(__pyx_k_add_edge_list), 0, 0, 1, 1},
-  {&__pyx_n_s_add_transpose, __pyx_k_add_transpose, sizeof(__pyx_k_add_transpose), 0, 0, 1, 1},
-  {&__pyx_kp_u_bfs_line_152, __pyx_k_bfs_line_152, sizeof(__pyx_k_bfs_line_152), 0, 1, 0, 0},
-  {&__pyx_n_s_bfs_wrapper, __pyx_k_bfs_wrapper, sizeof(__pyx_k_bfs_wrapper), 0, 0, 1, 1},
-  {&__pyx_n_s_cffi_view, __pyx_k_cffi_view, sizeof(__pyx_k_cffi_view), 0, 0, 1, 1},
-  {&__pyx_n_s_cffi_view_to_column_mem, __pyx_k_cffi_view_to_column_mem, sizeof(__pyx_k_cffi_view_to_column_mem), 0, 0, 1, 1},
-  {&__pyx_n_s_cline_in_traceback, __pyx_k_cline_in_traceback, sizeof(__pyx_k_cline_in_traceback), 0, 0, 1, 1},
-  {&__pyx_n_s_column, __pyx_k_column, sizeof(__pyx_k_column), 0, 0, 1, 1},
-  {&__pyx_kp_s_cuGraph_graph_class_containing, __pyx_k_cuGraph_graph_class_containing, sizeof(__pyx_k_cuGraph_graph_class_containing), 0, 0, 1, 0},
-  {&__pyx_n_s_cudf, __pyx_k_cudf, sizeof(__pyx_k_cudf), 0, 0, 1, 1},
-  {&__pyx_n_s_data, __pyx_k_data, sizeof(__pyx_k_data), 0, 0, 1, 1},
-  {&__pyx_n_s_data_2, __pyx_k_data_2, sizeof(__pyx_k_data_2), 0, 0, 1, 1},
-  {&__pyx_n_s_dest, __pyx_k_dest, sizeof(__pyx_k_dest), 0, 0, 1, 1},
-  {&__pyx_n_s_dest_col, __pyx_k_dest_col, sizeof(__pyx_k_dest_col), 0, 0, 1, 1},
-  {&__pyx_n_s_device_ctypes_pointer, __pyx_k_device_ctypes_pointer, sizeof(__pyx_k_device_ctypes_pointer), 0, 0, 1, 1},
-  {&__pyx_n_s_directed, __pyx_k_directed, sizeof(__pyx_k_directed), 0, 0, 1, 1},
-  {&__pyx_n_s_doc, __pyx_k_doc, sizeof(__pyx_k_doc), 0, 0, 1, 1},
-  {&__pyx_n_s_dtype, __pyx_k_dtype, sizeof(__pyx_k_dtype), 0, 0, 1, 1},
-  {&__pyx_n_s_dtypes, __pyx_k_dtypes, sizeof(__pyx_k_dtypes), 0, 0, 1, 1},
-  {&__pyx_n_s_end, __pyx_k_end, sizeof(__pyx_k_end), 0, 0, 1, 1},
-  {&__pyx_n_s_file, __pyx_k_file, sizeof(__pyx_k_file), 0, 0, 1, 1},
-  {&__pyx_n_s_float32, __pyx_k_float32, sizeof(__pyx_k_float32), 0, 0, 1, 1},
-  {&__pyx_n_s_float64, __pyx_k_float64, sizeof(__pyx_k_float64), 0, 0, 1, 1},
-  {&__pyx_n_s_g, __pyx_k_g, sizeof(__pyx_k_g), 0, 0, 1, 1},
-  {&__pyx_n_s_gdf, __pyx_k_gdf, sizeof(__pyx_k_gdf), 0, 0, 1, 1},
-  {&__pyx_n_s_get_column_data_ptr, __pyx_k_get_column_data_ptr, sizeof(__pyx_k_get_column_data_ptr), 0, 0, 1, 1},
-  {&__pyx_n_s_get_column_valid_ptr, __pyx_k_get_column_valid_ptr, sizeof(__pyx_k_get_column_valid_ptr), 0, 0, 1, 1},
-  {&__pyx_n_s_get_ctype_ptr, __pyx_k_get_ctype_ptr, sizeof(__pyx_k_get_ctype_ptr), 0, 0, 1, 1},
-  {&__pyx_n_s_graph, __pyx_k_graph, sizeof(__pyx_k_graph), 0, 0, 1, 1},
-  {&__pyx_n_s_graph_ptr, __pyx_k_graph_ptr, sizeof(__pyx_k_graph_ptr), 0, 0, 1, 1},
-  {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1},
-  {&__pyx_n_s_indices, __pyx_k_indices, sizeof(__pyx_k_indices), 0, 0, 1, 1},
-  {&__pyx_n_s_indices_col, __pyx_k_indices_col, sizeof(__pyx_k_indices_col), 0, 0, 1, 1},
-  {&__pyx_n_s_init, __pyx_k_init, sizeof(__pyx_k_init), 0, 0, 1, 1},
-  {&__pyx_n_s_int32, __pyx_k_int32, sizeof(__pyx_k_int32), 0, 0, 1, 1},
-  {&__pyx_n_s_int64, __pyx_k_int64, sizeof(__pyx_k_int64), 0, 0, 1, 1},
-  {&__pyx_n_s_librmm, __pyx_k_librmm, sizeof(__pyx_k_librmm), 0, 0, 1, 1},
-  {&__pyx_n_s_librmm_cffi, __pyx_k_librmm_cffi, sizeof(__pyx_k_librmm_cffi), 0, 0, 1, 1},
-  {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1},
-  {&__pyx_n_s_mask, __pyx_k_mask, sizeof(__pyx_k_mask), 0, 0, 1, 1},
-  {&__pyx_n_s_metaclass, __pyx_k_metaclass, sizeof(__pyx_k_metaclass), 0, 0, 1, 1},
-  {&__pyx_n_s_module, __pyx_k_module, sizeof(__pyx_k_module), 0, 0, 1, 1},
-  {&__pyx_n_s_np, __pyx_k_np, sizeof(__pyx_k_np), 0, 0, 1, 1},
-  {&__pyx_n_s_null_count, __pyx_k_null_count, sizeof(__pyx_k_null_count), 0, 0, 1, 1},
-  {&__pyx_n_s_numpy, __pyx_k_numpy, sizeof(__pyx_k_numpy), 0, 0, 1, 1},
-  {&__pyx_n_s_obj, __pyx_k_obj, sizeof(__pyx_k_obj), 0, 0, 1, 1},
-  {&__pyx_n_s_offsets, __pyx_k_offsets, sizeof(__pyx_k_offsets), 0, 0, 1, 1},
-  {&__pyx_n_s_offsets_col, __pyx_k_offsets_col, sizeof(__pyx_k_offsets_col), 0, 0, 1, 1},
-  {&__pyx_n_s_prepare, __pyx_k_prepare, sizeof(__pyx_k_prepare), 0, 0, 1, 1},
-  {&__pyx_n_s_print, __pyx_k_print, sizeof(__pyx_k_print), 0, 0, 1, 1},
-  {&__pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_k_python_bfs_bfs_wrapper_pyx, sizeof(__pyx_k_python_bfs_bfs_wrapper_pyx), 0, 0, 1, 0},
-  {&__pyx_n_s_qualname, __pyx_k_qualname, sizeof(__pyx_k_qualname), 0, 0, 1, 1},
-  {&__pyx_n_s_rmm, __pyx_k_rmm, sizeof(__pyx_k_rmm), 0, 0, 1, 1},
-  {&__pyx_n_s_self, __pyx_k_self, sizeof(__pyx_k_self), 0, 0, 1, 1},
-  {&__pyx_n_s_size, __pyx_k_size, sizeof(__pyx_k_size), 0, 0, 1, 1},
-  {&__pyx_n_s_source, __pyx_k_source, sizeof(__pyx_k_source), 0, 0, 1, 1},
-  {&__pyx_n_s_source_col, __pyx_k_source_col, sizeof(__pyx_k_source_col), 0, 0, 1, 1},
-  {&__pyx_n_s_start, __pyx_k_start, sizeof(__pyx_k_start), 0, 0, 1, 1},
-  {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1},
-  {&__pyx_n_s_to_gpu_array, __pyx_k_to_gpu_array, sizeof(__pyx_k_to_gpu_array), 0, 0, 1, 1},
-  {&__pyx_n_s_type, __pyx_k_type, sizeof(__pyx_k_type), 0, 0, 1, 1},
-  {&__pyx_n_s_value, __pyx_k_value, sizeof(__pyx_k_value), 0, 0, 1, 1},
-  {&__pyx_n_s_value_col, __pyx_k_value_col, sizeof(__pyx_k_value_col), 0, 0, 1, 1},
-  {&__pyx_n_s_view_edge_list, __pyx_k_view_edge_list, sizeof(__pyx_k_view_edge_list), 0, 0, 1, 1},
-  {&__pyx_n_s_zeros, __pyx_k_zeros, sizeof(__pyx_k_zeros), 0, 0, 1, 1},
-  {0, 0, 0, 0, 0, 0, 0}
-};
-static int __Pyx_InitCachedBuiltins(void) {
-  return 0;
-}
-
-static int __Pyx_InitCachedConstants(void) {
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0);
-
-  /* "bfs_wrapper.pyx":12
- * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}
- * 
- * def _get_ctype_ptr(obj):             # <<<<<<<<<<<<<<
- *     # The manner to access the pointers in the gdf's might change, so
- *     # encapsulating access in the following 3 methods. They might also be
- */
-  __pyx_tuple_ = PyTuple_Pack(1, __pyx_n_s_obj); if (unlikely(!__pyx_tuple_)) __PYX_ERR(0, 12, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_tuple_);
-  __Pyx_GIVEREF(__pyx_tuple_);
-  __pyx_codeobj__2 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple_, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_get_ctype_ptr, 12, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__2)) __PYX_ERR(0, 12, __pyx_L1_error)
-
-  /* "bfs_wrapper.pyx":18
- *     return obj.device_ctypes_pointer.value
- * 
- * def _get_column_data_ptr(obj):             # <<<<<<<<<<<<<<
- *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
- * 
- */
-  __pyx_tuple__3 = PyTuple_Pack(1, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__3)) __PYX_ERR(0, 18, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_tuple__3);
-  __Pyx_GIVEREF(__pyx_tuple__3);
-  __pyx_codeobj__4 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__3, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_get_column_data_ptr, 18, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__4)) __PYX_ERR(0, 18, __pyx_L1_error)
-
-  /* "bfs_wrapper.pyx":21
- *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
- * 
- * def _get_column_valid_ptr(obj):             # <<<<<<<<<<<<<<
- *     return _get_ctype_ptr(obj._column._mask.to_gpu_array())
- * 
- */
-  __pyx_tuple__5 = PyTuple_Pack(1, __pyx_n_s_obj); if (unlikely(!__pyx_tuple__5)) __PYX_ERR(0, 21, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_tuple__5);
-  __Pyx_GIVEREF(__pyx_tuple__5);
-  __pyx_codeobj__6 = (PyObject*)__Pyx_PyCode_New(1, 0, 1, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__5, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_get_column_valid_ptr, 21, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__6)) __PYX_ERR(0, 21, __pyx_L1_error)
-
-  /* "bfs_wrapper.pyx":48
- *         cuGraph graph class containing basic graph creation and transformation operations.
- *     """
- *     def __init__(self):             # <<<<<<<<<<<<<<
- *         """
- *         Returns
- */
-  __pyx_tuple__7 = PyTuple_Pack(3, __pyx_n_s_self, __pyx_n_s_graph, __pyx_n_s_graph_ptr); if (unlikely(!__pyx_tuple__7)) __PYX_ERR(0, 48, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_tuple__7);
-  __Pyx_GIVEREF(__pyx_tuple__7);
-  __pyx_codeobj__8 = (PyObject*)__Pyx_PyCode_New(1, 0, 3, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__7, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_init, 48, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__8)) __PYX_ERR(0, 48, __pyx_L1_error)
-
-  /* "bfs_wrapper.pyx":66
- * 
- * 
- *     def add_edge_list(self, source_col, dest_col, value_col=None):             # <<<<<<<<<<<<<<
- *         """
- *         Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory.
- */
-  __pyx_tuple__9 = PyTuple_Pack(8, __pyx_n_s_self, __pyx_n_s_source_col, __pyx_n_s_dest_col, __pyx_n_s_value_col, __pyx_n_s_graph, __pyx_n_s_source, __pyx_n_s_dest, __pyx_n_s_value); if (unlikely(!__pyx_tuple__9)) __PYX_ERR(0, 66, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_tuple__9);
-  __Pyx_GIVEREF(__pyx_tuple__9);
-  __pyx_codeobj__10 = (PyObject*)__Pyx_PyCode_New(4, 0, 8, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__9, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_add_edge_list, 66, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__10)) __PYX_ERR(0, 66, __pyx_L1_error)
-  __pyx_tuple__11 = PyTuple_Pack(1, ((PyObject *)Py_None)); if (unlikely(!__pyx_tuple__11)) __PYX_ERR(0, 66, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_tuple__11);
-  __Pyx_GIVEREF(__pyx_tuple__11);
-
-  /* "bfs_wrapper.pyx":111
- *                        <gdf_column*>value)
- * 
- *     def view_edge_list(self):             # <<<<<<<<<<<<<<
- *         ##TO DO
- *         """
- */
-  __pyx_tuple__12 = PyTuple_Pack(6, __pyx_n_s_self, __pyx_n_s_graph, __pyx_n_s_g, __pyx_n_s_size, __pyx_n_s_cffi_view, __pyx_n_s_data_2); if (unlikely(!__pyx_tuple__12)) __PYX_ERR(0, 111, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_tuple__12);
-  __Pyx_GIVEREF(__pyx_tuple__12);
-  __pyx_codeobj__13 = (PyObject*)__Pyx_PyCode_New(1, 0, 6, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__12, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_view_edge_list, 111, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__13)) __PYX_ERR(0, 111, __pyx_L1_error)
-
-  /* "bfs_wrapper.pyx":125
- *         return 0
- * 
- *     def add_adj_list(self, offsets_col, indices_col, value_col):             # <<<<<<<<<<<<<<
- *         """
- *         Warp existing gdf columns representing an adjacency list in a gdf_graph.
- */
-  __pyx_tuple__14 = PyTuple_Pack(8, __pyx_n_s_self, __pyx_n_s_offsets_col, __pyx_n_s_indices_col, __pyx_n_s_value_col, __pyx_n_s_graph, __pyx_n_s_offsets, __pyx_n_s_indices, __pyx_n_s_value); if (unlikely(!__pyx_tuple__14)) __PYX_ERR(0, 125, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_tuple__14);
-  __Pyx_GIVEREF(__pyx_tuple__14);
-  __pyx_codeobj__15 = (PyObject*)__Pyx_PyCode_New(4, 0, 8, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__14, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_add_adj_list, 125, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__15)) __PYX_ERR(0, 125, __pyx_L1_error)
-
-  /* "bfs_wrapper.pyx":145
- * 
- * 
- *     def add_transpose(self):             # <<<<<<<<<<<<<<
- *         """
- *         Compute the transposed adjacency list from the edge list and add it to the existing graph.
- */
-  __pyx_tuple__16 = PyTuple_Pack(2, __pyx_n_s_self, __pyx_n_s_graph); if (unlikely(!__pyx_tuple__16)) __PYX_ERR(0, 145, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_tuple__16);
-  __Pyx_GIVEREF(__pyx_tuple__16);
-  __pyx_codeobj__17 = (PyObject*)__Pyx_PyCode_New(1, 0, 2, 0, CO_OPTIMIZED|CO_NEWLOCALS, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__16, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_python_bfs_bfs_wrapper_pyx, __pyx_n_s_add_transpose, 145, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__17)) __PYX_ERR(0, 145, __pyx_L1_error)
-  __Pyx_RefNannyFinishContext();
-  return 0;
-  __pyx_L1_error:;
-  __Pyx_RefNannyFinishContext();
-  return -1;
-}
-
-static int __Pyx_InitGlobals(void) {
-  if (__Pyx_InitStrings(__pyx_string_tab) < 0) __PYX_ERR(0, 1, __pyx_L1_error);
-  __pyx_int_0 = PyInt_FromLong(0); if (unlikely(!__pyx_int_0)) __PYX_ERR(0, 1, __pyx_L1_error)
-  return 0;
-  __pyx_L1_error:;
-  return -1;
-}
-
-static int __Pyx_modinit_global_init_code(void); /*proto*/
-static int __Pyx_modinit_variable_export_code(void); /*proto*/
-static int __Pyx_modinit_function_export_code(void); /*proto*/
-static int __Pyx_modinit_type_init_code(void); /*proto*/
-static int __Pyx_modinit_type_import_code(void); /*proto*/
-static int __Pyx_modinit_variable_import_code(void); /*proto*/
-static int __Pyx_modinit_function_import_code(void); /*proto*/
-
-static int __Pyx_modinit_global_init_code(void) {
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("__Pyx_modinit_global_init_code", 0);
-  /*--- Global init code ---*/
-  __Pyx_RefNannyFinishContext();
-  return 0;
-}
-
-static int __Pyx_modinit_variable_export_code(void) {
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("__Pyx_modinit_variable_export_code", 0);
-  /*--- Variable export code ---*/
-  __Pyx_RefNannyFinishContext();
-  return 0;
-}
-
-static int __Pyx_modinit_function_export_code(void) {
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("__Pyx_modinit_function_export_code", 0);
-  /*--- Function export code ---*/
-  __Pyx_RefNannyFinishContext();
-  return 0;
-}
-
-static int __Pyx_modinit_type_init_code(void) {
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("__Pyx_modinit_type_init_code", 0);
-  /*--- Type init code ---*/
-  __Pyx_RefNannyFinishContext();
-  return 0;
-}
-
-static int __Pyx_modinit_type_import_code(void) {
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("__Pyx_modinit_type_import_code", 0);
-  /*--- Type import code ---*/
-  __Pyx_RefNannyFinishContext();
-  return 0;
-}
-
-static int __Pyx_modinit_variable_import_code(void) {
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("__Pyx_modinit_variable_import_code", 0);
-  /*--- Variable import code ---*/
-  __Pyx_RefNannyFinishContext();
-  return 0;
-}
-
-static int __Pyx_modinit_function_import_code(void) {
-  __Pyx_RefNannyDeclarations
-  __Pyx_RefNannySetupContext("__Pyx_modinit_function_import_code", 0);
-  /*--- Function import code ---*/
-  __Pyx_RefNannyFinishContext();
-  return 0;
-}
-
-
-#if PY_MAJOR_VERSION < 3
-#ifdef CYTHON_NO_PYINIT_EXPORT
-#define __Pyx_PyMODINIT_FUNC void
-#else
-#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC
-#endif
-#else
-#ifdef CYTHON_NO_PYINIT_EXPORT
-#define __Pyx_PyMODINIT_FUNC PyObject *
-#else
-#define __Pyx_PyMODINIT_FUNC PyMODINIT_FUNC
-#endif
-#endif
-#ifndef CYTHON_SMALL_CODE
-#if defined(__clang__)
-    #define CYTHON_SMALL_CODE
-#elif defined(__GNUC__) && (!(defined(__cplusplus)) || (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 4)))
-    #define CYTHON_SMALL_CODE __attribute__((cold))
-#else
-    #define CYTHON_SMALL_CODE
-#endif
-#endif
-
-
-#if PY_MAJOR_VERSION < 3
-__Pyx_PyMODINIT_FUNC initbfs_wrapper(void) CYTHON_SMALL_CODE; /*proto*/
-__Pyx_PyMODINIT_FUNC initbfs_wrapper(void)
-#else
-__Pyx_PyMODINIT_FUNC PyInit_bfs_wrapper(void) CYTHON_SMALL_CODE; /*proto*/
-__Pyx_PyMODINIT_FUNC PyInit_bfs_wrapper(void)
-#if CYTHON_PEP489_MULTI_PHASE_INIT
-{
-  return PyModuleDef_Init(&__pyx_moduledef);
-}
-static int __Pyx_copy_spec_to_module(PyObject *spec, PyObject *moddict, const char* from_name, const char* to_name) {
-    PyObject *value = PyObject_GetAttrString(spec, from_name);
-    int result = 0;
-    if (likely(value)) {
-        result = PyDict_SetItemString(moddict, to_name, value);
-        Py_DECREF(value);
-    } else if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
-        PyErr_Clear();
-    } else {
-        result = -1;
-    }
-    return result;
-}
-static PyObject* __pyx_pymod_create(PyObject *spec, CYTHON_UNUSED PyModuleDef *def) {
-    PyObject *module = NULL, *moddict, *modname;
-    if (__pyx_m)
-        return __Pyx_NewRef(__pyx_m);
-    modname = PyObject_GetAttrString(spec, "name");
-    if (unlikely(!modname)) goto bad;
-    module = PyModule_NewObject(modname);
-    Py_DECREF(modname);
-    if (unlikely(!module)) goto bad;
-    moddict = PyModule_GetDict(module);
-    if (unlikely(!moddict)) goto bad;
-    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "loader", "__loader__") < 0)) goto bad;
-    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "origin", "__file__") < 0)) goto bad;
-    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "parent", "__package__") < 0)) goto bad;
-    if (unlikely(__Pyx_copy_spec_to_module(spec, moddict, "submodule_search_locations", "__path__") < 0)) goto bad;
-    return module;
-bad:
-    Py_XDECREF(module);
-    return NULL;
-}
-
-
-static int __pyx_pymod_exec_bfs_wrapper(PyObject *__pyx_pyinit_module)
-#endif
-#endif
-{
-  PyObject *__pyx_t_1 = NULL;
-  PyObject *__pyx_t_2 = NULL;
-  PyObject *__pyx_t_3 = NULL;
-  __Pyx_RefNannyDeclarations
-  #if CYTHON_PEP489_MULTI_PHASE_INIT
-  if (__pyx_m && __pyx_m == __pyx_pyinit_module) return 0;
-  #elif PY_MAJOR_VERSION >= 3
-  if (__pyx_m) return __Pyx_NewRef(__pyx_m);
-  #endif
-  #if CYTHON_REFNANNY
-__Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny");
-if (!__Pyx_RefNanny) {
-  PyErr_Clear();
-  __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny");
-  if (!__Pyx_RefNanny)
-      Py_FatalError("failed to import 'refnanny' module");
-}
-#endif
-  __Pyx_RefNannySetupContext("__Pyx_PyMODINIT_FUNC PyInit_bfs_wrapper(void)", 0);
-  if (__Pyx_check_binary_version() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) __PYX_ERR(0, 1, __pyx_L1_error)
-  __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) __PYX_ERR(0, 1, __pyx_L1_error)
-  __pyx_empty_unicode = PyUnicode_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_unicode)) __PYX_ERR(0, 1, __pyx_L1_error)
-  #ifdef __Pyx_CyFunction_USED
-  if (__pyx_CyFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  #endif
-  #ifdef __Pyx_FusedFunction_USED
-  if (__pyx_FusedFunction_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  #endif
-  #ifdef __Pyx_Coroutine_USED
-  if (__pyx_Coroutine_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  #endif
-  #ifdef __Pyx_Generator_USED
-  if (__pyx_Generator_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  #endif
-  #ifdef __Pyx_AsyncGen_USED
-  if (__pyx_AsyncGen_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  #endif
-  #ifdef __Pyx_StopAsyncIteration_USED
-  if (__pyx_StopAsyncIteration_init() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  #endif
-  /*--- Library function declarations ---*/
-  /*--- Threads initialization code ---*/
-  #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS
-  #ifdef WITH_THREAD /* Python build with threading support? */
-  PyEval_InitThreads();
-  #endif
-  #endif
-  /*--- Module creation code ---*/
-  #if CYTHON_PEP489_MULTI_PHASE_INIT
-  __pyx_m = __pyx_pyinit_module;
-  Py_INCREF(__pyx_m);
-  #else
-  #if PY_MAJOR_VERSION < 3
-  __pyx_m = Py_InitModule4("bfs_wrapper", __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m);
-  #else
-  __pyx_m = PyModule_Create(&__pyx_moduledef);
-  #endif
-  if (unlikely(!__pyx_m)) __PYX_ERR(0, 1, __pyx_L1_error)
-  #endif
-  __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) __PYX_ERR(0, 1, __pyx_L1_error)
-  Py_INCREF(__pyx_d);
-  __pyx_b = PyImport_AddModule(__Pyx_BUILTIN_MODULE_NAME); if (unlikely(!__pyx_b)) __PYX_ERR(0, 1, __pyx_L1_error)
-  __pyx_cython_runtime = PyImport_AddModule((char *) "cython_runtime"); if (unlikely(!__pyx_cython_runtime)) __PYX_ERR(0, 1, __pyx_L1_error)
-  #if CYTHON_COMPILING_IN_PYPY
-  Py_INCREF(__pyx_b);
-  #endif
-  if (PyObject_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) __PYX_ERR(0, 1, __pyx_L1_error);
-  /*--- Initialize various global constants etc. ---*/
-  if (__Pyx_InitGlobals() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT)
-  if (__Pyx_init_sys_getdefaultencoding_params() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  #endif
-  if (__pyx_module_is_main_bfs_wrapper) {
-    if (PyObject_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  }
-  #if PY_MAJOR_VERSION >= 3
-  {
-    PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) __PYX_ERR(0, 1, __pyx_L1_error)
-    if (!PyDict_GetItemString(modules, "bfs_wrapper")) {
-      if (unlikely(PyDict_SetItemString(modules, "bfs_wrapper", __pyx_m) < 0)) __PYX_ERR(0, 1, __pyx_L1_error)
-    }
-  }
-  #endif
-  /*--- Builtin init code ---*/
-  if (__Pyx_InitCachedBuiltins() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  /*--- Constants init code ---*/
-  if (__Pyx_InitCachedConstants() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  /*--- Global type/function init code ---*/
-  (void)__Pyx_modinit_global_init_code();
-  (void)__Pyx_modinit_variable_export_code();
-  (void)__Pyx_modinit_function_export_code();
-  (void)__Pyx_modinit_type_init_code();
-  (void)__Pyx_modinit_type_import_code();
-  (void)__Pyx_modinit_variable_import_code();
-  (void)__Pyx_modinit_function_import_code();
-  /*--- Execution code ---*/
-  #if defined(__Pyx_Generator_USED) || defined(__Pyx_Coroutine_USED)
-  if (__Pyx_patch_abc() < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  #endif
-
-  /* "bfs_wrapper.pyx":5
- * from libc.stdint cimport uintptr_t
- * from libc.stdlib cimport calloc, malloc, free
- * import cudf             # <<<<<<<<<<<<<<
- * from librmm_cffi import librmm as rmm
- * #from pygdf import Column
- */
-  __pyx_t_1 = __Pyx_Import(__pyx_n_s_cudf, 0, -1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 5, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (PyDict_SetItem(__pyx_d, __pyx_n_s_cudf, __pyx_t_1) < 0) __PYX_ERR(0, 5, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":6
- * from libc.stdlib cimport calloc, malloc, free
- * import cudf
- * from librmm_cffi import librmm as rmm             # <<<<<<<<<<<<<<
- * #from pygdf import Column
- * import numpy as np
- */
-  __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __Pyx_INCREF(__pyx_n_s_librmm);
-  __Pyx_GIVEREF(__pyx_n_s_librmm);
-  PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_librmm);
-  __pyx_t_2 = __Pyx_Import(__pyx_n_s_librmm_cffi, __pyx_t_1, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 6, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_librmm); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 6, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (PyDict_SetItem(__pyx_d, __pyx_n_s_rmm, __pyx_t_1) < 0) __PYX_ERR(0, 6, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-
-  /* "bfs_wrapper.pyx":8
- * from librmm_cffi import librmm as rmm
- * #from pygdf import Column
- * import numpy as np             # <<<<<<<<<<<<<<
- * 
- * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}
- */
-  __pyx_t_2 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 8, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_2) < 0) __PYX_ERR(0, 8, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-
-  /* "bfs_wrapper.pyx":10
- * import numpy as np
- * 
- * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}             # <<<<<<<<<<<<<<
- * 
- * def _get_ctype_ptr(obj):
- */
-  __pyx_t_2 = __Pyx_PyDict_NewPresized(4); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_int32); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_3);
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_t_1 = __Pyx_PyInt_From_gdf_dtype(GDF_INT32); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (PyDict_SetItem(__pyx_t_2, __pyx_t_3, __pyx_t_1) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_int64); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_3);
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_t_1 = __Pyx_PyInt_From_gdf_dtype(GDF_INT64); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (PyDict_SetItem(__pyx_t_2, __pyx_t_3, __pyx_t_1) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_float32); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_3);
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_t_1 = __Pyx_PyInt_From_gdf_dtype(GDF_FLOAT32); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (PyDict_SetItem(__pyx_t_2, __pyx_t_3, __pyx_t_1) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __pyx_t_3 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_float64); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_3);
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __pyx_t_1 = __Pyx_PyInt_From_gdf_dtype(GDF_FLOAT64); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (PyDict_SetItem(__pyx_t_2, __pyx_t_3, __pyx_t_1) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  if (PyDict_SetItem(__pyx_d, __pyx_n_s_dtypes, __pyx_t_2) < 0) __PYX_ERR(0, 10, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-
-  /* "bfs_wrapper.pyx":12
- * dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}
- * 
- * def _get_ctype_ptr(obj):             # <<<<<<<<<<<<<<
- *     # The manner to access the pointers in the gdf's might change, so
- *     # encapsulating access in the following 3 methods. They might also be
- */
-  __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_11bfs_wrapper_1_get_ctype_ptr, NULL, __pyx_n_s_bfs_wrapper); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 12, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  if (PyDict_SetItem(__pyx_d, __pyx_n_s_get_ctype_ptr, __pyx_t_2) < 0) __PYX_ERR(0, 12, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-
-  /* "bfs_wrapper.pyx":18
- *     return obj.device_ctypes_pointer.value
- * 
- * def _get_column_data_ptr(obj):             # <<<<<<<<<<<<<<
- *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
- * 
- */
-  __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_11bfs_wrapper_3_get_column_data_ptr, NULL, __pyx_n_s_bfs_wrapper); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 18, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  if (PyDict_SetItem(__pyx_d, __pyx_n_s_get_column_data_ptr, __pyx_t_2) < 0) __PYX_ERR(0, 18, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-
-  /* "bfs_wrapper.pyx":21
- *     return _get_ctype_ptr(obj._column._data.to_gpu_array())
- * 
- * def _get_column_valid_ptr(obj):             # <<<<<<<<<<<<<<
- *     return _get_ctype_ptr(obj._column._mask.to_gpu_array())
- * 
- */
-  __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5_get_column_valid_ptr, NULL, __pyx_n_s_bfs_wrapper); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 21, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  if (PyDict_SetItem(__pyx_d, __pyx_n_s_get_column_valid_ptr, __pyx_t_2) < 0) __PYX_ERR(0, 21, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-
-  /* "bfs_wrapper.pyx":44
- *     return col_ptr
- * 
- * class Graph:             # <<<<<<<<<<<<<<
- *     """
- *         cuGraph graph class containing basic graph creation and transformation operations.
- */
-  __pyx_t_2 = __Pyx_Py3MetaclassPrepare((PyObject *) NULL, __pyx_empty_tuple, __pyx_n_s_Graph, __pyx_n_s_Graph, (PyObject *) NULL, __pyx_n_s_bfs_wrapper, __pyx_kp_s_cuGraph_graph_class_containing); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 44, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-
-  /* "bfs_wrapper.pyx":48
- *         cuGraph graph class containing basic graph creation and transformation operations.
- *     """
- *     def __init__(self):             # <<<<<<<<<<<<<<
- *         """
- *         Returns
- */
-  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5Graph_1__init__, 0, __pyx_n_s_Graph___init, NULL, __pyx_n_s_bfs_wrapper, __pyx_d, ((PyObject *)__pyx_codeobj__8)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 48, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (__Pyx_SetNameInClass(__pyx_t_2, __pyx_n_s_init, __pyx_t_1) < 0) __PYX_ERR(0, 48, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":66
- * 
- * 
- *     def add_edge_list(self, source_col, dest_col, value_col=None):             # <<<<<<<<<<<<<<
- *         """
- *         Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory.
- */
-  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5Graph_3add_edge_list, 0, __pyx_n_s_Graph_add_edge_list, NULL, __pyx_n_s_bfs_wrapper, __pyx_d, ((PyObject *)__pyx_codeobj__10)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 66, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  __Pyx_CyFunction_SetDefaultsTuple(__pyx_t_1, __pyx_tuple__11);
-  if (__Pyx_SetNameInClass(__pyx_t_2, __pyx_n_s_add_edge_list, __pyx_t_1) < 0) __PYX_ERR(0, 66, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":111
- *                        <gdf_column*>value)
- * 
- *     def view_edge_list(self):             # <<<<<<<<<<<<<<
- *         ##TO DO
- *         """
- */
-  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5Graph_5view_edge_list, 0, __pyx_n_s_Graph_view_edge_list, NULL, __pyx_n_s_bfs_wrapper, __pyx_d, ((PyObject *)__pyx_codeobj__13)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 111, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (__Pyx_SetNameInClass(__pyx_t_2, __pyx_n_s_view_edge_list, __pyx_t_1) < 0) __PYX_ERR(0, 111, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":125
- *         return 0
- * 
- *     def add_adj_list(self, offsets_col, indices_col, value_col):             # <<<<<<<<<<<<<<
- *         """
- *         Warp existing gdf columns representing an adjacency list in a gdf_graph.
- */
-  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5Graph_7add_adj_list, 0, __pyx_n_s_Graph_add_adj_list, NULL, __pyx_n_s_bfs_wrapper, __pyx_d, ((PyObject *)__pyx_codeobj__15)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 125, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (__Pyx_SetNameInClass(__pyx_t_2, __pyx_n_s_add_adj_list, __pyx_t_1) < 0) __PYX_ERR(0, 125, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":145
- * 
- * 
- *     def add_transpose(self):             # <<<<<<<<<<<<<<
- *         """
- *         Compute the transposed adjacency list from the edge list and add it to the existing graph.
- */
-  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_11bfs_wrapper_5Graph_9add_transpose, 0, __pyx_n_s_Graph_add_transpose, NULL, __pyx_n_s_bfs_wrapper, __pyx_d, ((PyObject *)__pyx_codeobj__17)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 145, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (__Pyx_SetNameInClass(__pyx_t_2, __pyx_n_s_add_transpose, __pyx_t_1) < 0) __PYX_ERR(0, 145, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-
-  /* "bfs_wrapper.pyx":44
- *     return col_ptr
- * 
- * class Graph:             # <<<<<<<<<<<<<<
- *     """
- *         cuGraph graph class containing basic graph creation and transformation operations.
- */
-  __pyx_t_1 = __Pyx_Py3ClassCreate(((PyObject*)&__Pyx_DefaultClassType), __pyx_n_s_Graph, __pyx_empty_tuple, __pyx_t_2, NULL, 0, 1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 44, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_1);
-  if (PyDict_SetItem(__pyx_d, __pyx_n_s_Graph, __pyx_t_1) < 0) __PYX_ERR(0, 44, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-
-  /* "bfs_wrapper.pyx":1
- * from c_bfs cimport *             # <<<<<<<<<<<<<<
- * from libcpp cimport bool
- * from libc.stdint cimport uintptr_t
- */
-  __pyx_t_2 = __Pyx_PyDict_NewPresized(3); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 1, __pyx_L1_error)
-  __Pyx_GOTREF(__pyx_t_2);
-  if (PyDict_SetItem(__pyx_t_2, __pyx_kp_u_Graph___init___line_48, __pyx_kp_u_Returns_Graph_cuGraph_Graph_Exa) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  if (PyDict_SetItem(__pyx_t_2, __pyx_kp_u_Graph_add_edge_list_line_66, __pyx_kp_u_Warp_existing_gdf_columns_repre) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  if (PyDict_SetItem(__pyx_t_2, __pyx_kp_u_bfs_line_152, __pyx_kp_u_Find_the_distances_and_predeces) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_2) < 0) __PYX_ERR(0, 1, __pyx_L1_error)
-  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
-
-  /*--- Wrapped vars code ---*/
-
-  goto __pyx_L0;
-  __pyx_L1_error:;
-  __Pyx_XDECREF(__pyx_t_1);
-  __Pyx_XDECREF(__pyx_t_2);
-  __Pyx_XDECREF(__pyx_t_3);
-  if (__pyx_m) {
-    if (__pyx_d) {
-      __Pyx_AddTraceback("init bfs_wrapper", 0, __pyx_lineno, __pyx_filename);
-    }
-    Py_DECREF(__pyx_m); __pyx_m = 0;
-  } else if (!PyErr_Occurred()) {
-    PyErr_SetString(PyExc_ImportError, "init bfs_wrapper");
-  }
-  __pyx_L0:;
-  __Pyx_RefNannyFinishContext();
-  #if CYTHON_PEP489_MULTI_PHASE_INIT
-  return (__pyx_m != NULL) ? 0 : -1;
-  #elif PY_MAJOR_VERSION >= 3
-  return __pyx_m;
-  #else
-  return;
-  #endif
-}
-
-/* --- Runtime support code --- */
-/* Refnanny */
-#if CYTHON_REFNANNY
-static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) {
-    PyObject *m = NULL, *p = NULL;
-    void *r = NULL;
-    m = PyImport_ImportModule((char *)modname);
-    if (!m) goto end;
-    p = PyObject_GetAttrString(m, (char *)"RefNannyAPI");
-    if (!p) goto end;
-    r = PyLong_AsVoidPtr(p);
-end:
-    Py_XDECREF(p);
-    Py_XDECREF(m);
-    return (__Pyx_RefNannyAPIStruct *)r;
-}
-#endif
-
-/* PyObjectGetAttrStr */
-#if CYTHON_USE_TYPE_SLOTS
-static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) {
-    PyTypeObject* tp = Py_TYPE(obj);
-    if (likely(tp->tp_getattro))
-        return tp->tp_getattro(obj, attr_name);
-#if PY_MAJOR_VERSION < 3
-    if (likely(tp->tp_getattr))
-        return tp->tp_getattr(obj, PyString_AS_STRING(attr_name));
-#endif
-    return PyObject_GetAttr(obj, attr_name);
-}
-#endif
-
-/* GetBuiltinName */
-static PyObject *__Pyx_GetBuiltinName(PyObject *name) {
-    PyObject* result = __Pyx_PyObject_GetAttrStr(__pyx_b, name);
-    if (unlikely(!result)) {
-        PyErr_Format(PyExc_NameError,
-#if PY_MAJOR_VERSION >= 3
-            "name '%U' is not defined", name);
-#else
-            "name '%.200s' is not defined", PyString_AS_STRING(name));
-#endif
-    }
-    return result;
-}
-
-/* GetModuleGlobalName */
-static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name) {
-    PyObject *result;
-#if !CYTHON_AVOID_BORROWED_REFS
-#if CYTHON_COMPILING_IN_CPYTHON && PY_VERSION_HEX >= 0x030500A1
-    result = _PyDict_GetItem_KnownHash(__pyx_d, name, ((PyASCIIObject *) name)->hash);
-    if (likely(result)) {
-        Py_INCREF(result);
-    } else if (unlikely(PyErr_Occurred())) {
-        result = NULL;
-    } else {
-#else
-    result = PyDict_GetItem(__pyx_d, name);
-    if (likely(result)) {
-        Py_INCREF(result);
-    } else {
-#endif
-#else
-    result = PyObject_GetItem(__pyx_d, name);
-    if (!result) {
-        PyErr_Clear();
-#endif
-        result = __Pyx_GetBuiltinName(name);
-    }
-    return result;
-}
-
-/* PyCFunctionFastCall */
-    #if CYTHON_FAST_PYCCALL
-static CYTHON_INLINE PyObject * __Pyx_PyCFunction_FastCall(PyObject *func_obj, PyObject **args, Py_ssize_t nargs) {
-    PyCFunctionObject *func = (PyCFunctionObject*)func_obj;
-    PyCFunction meth = PyCFunction_GET_FUNCTION(func);
-    PyObject *self = PyCFunction_GET_SELF(func);
-    int flags = PyCFunction_GET_FLAGS(func);
-    assert(PyCFunction_Check(func));
-    assert(METH_FASTCALL == (flags & ~(METH_CLASS | METH_STATIC | METH_COEXIST | METH_KEYWORDS)));
-    assert(nargs >= 0);
-    assert(nargs == 0 || args != NULL);
-    /* _PyCFunction_FastCallDict() must not be called with an exception set,
-       because it may clear it (directly or indirectly) and so the
-       caller loses its exception */
-    assert(!PyErr_Occurred());
-    if ((PY_VERSION_HEX < 0x030700A0) || unlikely(flags & METH_KEYWORDS)) {
-        return (*((__Pyx_PyCFunctionFastWithKeywords)meth)) (self, args, nargs, NULL);
-    } else {
-        return (*((__Pyx_PyCFunctionFast)meth)) (self, args, nargs);
-    }
-}
-#endif
-
-/* PyFunctionFastCall */
-    #if CYTHON_FAST_PYCALL
-#include "frameobject.h"
-static PyObject* __Pyx_PyFunction_FastCallNoKw(PyCodeObject *co, PyObject **args, Py_ssize_t na,
-                                               PyObject *globals) {
-    PyFrameObject *f;
-    PyThreadState *tstate = __Pyx_PyThreadState_Current;
-    PyObject **fastlocals;
-    Py_ssize_t i;
-    PyObject *result;
-    assert(globals != NULL);
-    /* XXX Perhaps we should create a specialized
-       PyFrame_New() that doesn't take locals, but does
-       take builtins without sanity checking them.
-       */
-    assert(tstate != NULL);
-    f = PyFrame_New(tstate, co, globals, NULL);
-    if (f == NULL) {
-        return NULL;
-    }
-    fastlocals = f->f_localsplus;
-    for (i = 0; i < na; i++) {
-        Py_INCREF(*args);
-        fastlocals[i] = *args++;
-    }
-    result = PyEval_EvalFrameEx(f,0);
-    ++tstate->recursion_depth;
-    Py_DECREF(f);
-    --tstate->recursion_depth;
-    return result;
-}
-#if 1 || PY_VERSION_HEX < 0x030600B1
-static PyObject *__Pyx_PyFunction_FastCallDict(PyObject *func, PyObject **args, int nargs, PyObject *kwargs) {
-    PyCodeObject *co = (PyCodeObject *)PyFunction_GET_CODE(func);
-    PyObject *globals = PyFunction_GET_GLOBALS(func);
-    PyObject *argdefs = PyFunction_GET_DEFAULTS(func);
-    PyObject *closure;
-#if PY_MAJOR_VERSION >= 3
-    PyObject *kwdefs;
-#endif
-    PyObject *kwtuple, **k;
-    PyObject **d;
-    Py_ssize_t nd;
-    Py_ssize_t nk;
-    PyObject *result;
-    assert(kwargs == NULL || PyDict_Check(kwargs));
-    nk = kwargs ? PyDict_Size(kwargs) : 0;
-    if (Py_EnterRecursiveCall((char*)" while calling a Python object")) {
-        return NULL;
-    }
-    if (
-#if PY_MAJOR_VERSION >= 3
-            co->co_kwonlyargcount == 0 &&
-#endif
-            likely(kwargs == NULL || nk == 0) &&
-            co->co_flags == (CO_OPTIMIZED | CO_NEWLOCALS | CO_NOFREE)) {
-        if (argdefs == NULL && co->co_argcount == nargs) {
-            result = __Pyx_PyFunction_FastCallNoKw(co, args, nargs, globals);
-            goto done;
-        }
-        else if (nargs == 0 && argdefs != NULL
-                 && co->co_argcount == Py_SIZE(argdefs)) {
-            /* function called with no arguments, but all parameters have
-               a default value: use default values as arguments .*/
-            args = &PyTuple_GET_ITEM(argdefs, 0);
-            result =__Pyx_PyFunction_FastCallNoKw(co, args, Py_SIZE(argdefs), globals);
-            goto done;
-        }
-    }
-    if (kwargs != NULL) {
-        Py_ssize_t pos, i;
-        kwtuple = PyTuple_New(2 * nk);
-        if (kwtuple == NULL) {
-            result = NULL;
-            goto done;
-        }
-        k = &PyTuple_GET_ITEM(kwtuple, 0);
-        pos = i = 0;
-        while (PyDict_Next(kwargs, &pos, &k[i], &k[i+1])) {
-            Py_INCREF(k[i]);
-            Py_INCREF(k[i+1]);
-            i += 2;
-        }
-        nk = i / 2;
-    }
-    else {
-        kwtuple = NULL;
-        k = NULL;
-    }
-    closure = PyFunction_GET_CLOSURE(func);
-#if PY_MAJOR_VERSION >= 3
-    kwdefs = PyFunction_GET_KW_DEFAULTS(func);
-#endif
-    if (argdefs != NULL) {
-        d = &PyTuple_GET_ITEM(argdefs, 0);
-        nd = Py_SIZE(argdefs);
-    }
-    else {
-        d = NULL;
-        nd = 0;
-    }
-#if PY_MAJOR_VERSION >= 3
-    result = PyEval_EvalCodeEx((PyObject*)co, globals, (PyObject *)NULL,
-                               args, nargs,
-                               k, (int)nk,
-                               d, (int)nd, kwdefs, closure);
-#else
-    result = PyEval_EvalCodeEx(co, globals, (PyObject *)NULL,
-                               args, nargs,
-                               k, (int)nk,
-                               d, (int)nd, closure);
-#endif
-    Py_XDECREF(kwtuple);
-done:
-    Py_LeaveRecursiveCall();
-    return result;
-}
-#endif
-#endif
-
-/* PyObjectCall */
-    #if CYTHON_COMPILING_IN_CPYTHON
-static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) {
-    PyObject *result;
-    ternaryfunc call = func->ob_type->tp_call;
-    if (unlikely(!call))
-        return PyObject_Call(func, arg, kw);
-    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object")))
-        return NULL;
-    result = (*call)(func, arg, kw);
-    Py_LeaveRecursiveCall();
-    if (unlikely(!result) && unlikely(!PyErr_Occurred())) {
-        PyErr_SetString(
-            PyExc_SystemError,
-            "NULL result without error in PyObject_Call");
-    }
-    return result;
-}
-#endif
-
-/* PyObjectCallMethO */
-    #if CYTHON_COMPILING_IN_CPYTHON
-static CYTHON_INLINE PyObject* __Pyx_PyObject_CallMethO(PyObject *func, PyObject *arg) {
-    PyObject *self, *result;
-    PyCFunction cfunc;
-    cfunc = PyCFunction_GET_FUNCTION(func);
-    self = PyCFunction_GET_SELF(func);
-    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object")))
-        return NULL;
-    result = cfunc(self, arg);
-    Py_LeaveRecursiveCall();
-    if (unlikely(!result) && unlikely(!PyErr_Occurred())) {
-        PyErr_SetString(
-            PyExc_SystemError,
-            "NULL result without error in PyObject_Call");
-    }
-    return result;
-}
-#endif
-
-/* PyObjectCallOneArg */
-    #if CYTHON_COMPILING_IN_CPYTHON
-static PyObject* __Pyx__PyObject_CallOneArg(PyObject *func, PyObject *arg) {
-    PyObject *result;
-    PyObject *args = PyTuple_New(1);
-    if (unlikely(!args)) return NULL;
-    Py_INCREF(arg);
-    PyTuple_SET_ITEM(args, 0, arg);
-    result = __Pyx_PyObject_Call(func, args, NULL);
-    Py_DECREF(args);
-    return result;
-}
-static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) {
-#if CYTHON_FAST_PYCALL
-    if (PyFunction_Check(func)) {
-        return __Pyx_PyFunction_FastCall(func, &arg, 1);
-    }
-#endif
-    if (likely(PyCFunction_Check(func))) {
-        if (likely(PyCFunction_GET_FLAGS(func) & METH_O)) {
-            return __Pyx_PyObject_CallMethO(func, arg);
-#if CYTHON_FAST_PYCCALL
-        } else if (PyCFunction_GET_FLAGS(func) & METH_FASTCALL) {
-            return __Pyx_PyCFunction_FastCall(func, &arg, 1);
-#endif
-        }
-    }
-    return __Pyx__PyObject_CallOneArg(func, arg);
-}
-#else
-static CYTHON_INLINE PyObject* __Pyx_PyObject_CallOneArg(PyObject *func, PyObject *arg) {
-    PyObject *result;
-    PyObject *args = PyTuple_Pack(1, arg);
-    if (unlikely(!args)) return NULL;
-    result = __Pyx_PyObject_Call(func, args, NULL);
-    Py_DECREF(args);
-    return result;
-}
-#endif
-
-/* PyObjectCallNoArg */
-    #if CYTHON_COMPILING_IN_CPYTHON
-static CYTHON_INLINE PyObject* __Pyx_PyObject_CallNoArg(PyObject *func) {
-#if CYTHON_FAST_PYCALL
-    if (PyFunction_Check(func)) {
-        return __Pyx_PyFunction_FastCall(func, NULL, 0);
-    }
-#endif
-#ifdef __Pyx_CyFunction_USED
-    if (likely(PyCFunction_Check(func) || __Pyx_TypeCheck(func, __pyx_CyFunctionType))) {
-#else
-    if (likely(PyCFunction_Check(func))) {
-#endif
-        if (likely(PyCFunction_GET_FLAGS(func) & METH_NOARGS)) {
-            return __Pyx_PyObject_CallMethO(func, NULL);
-        }
-    }
-    return __Pyx_PyObject_Call(func, __pyx_empty_tuple, NULL);
-}
-#endif
-
-/* GetItemInt */
-      static PyObject *__Pyx_GetItemInt_Generic(PyObject *o, PyObject* j) {
-    PyObject *r;
-    if (!j) return NULL;
-    r = PyObject_GetItem(o, j);
-    Py_DECREF(j);
-    return r;
-}
-static CYTHON_INLINE PyObject *__Pyx_GetItemInt_List_Fast(PyObject *o, Py_ssize_t i,
-                                                              CYTHON_NCP_UNUSED int wraparound,
-                                                              CYTHON_NCP_UNUSED int boundscheck) {
-#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
-    Py_ssize_t wrapped_i = i;
-    if (wraparound & unlikely(i < 0)) {
-        wrapped_i += PyList_GET_SIZE(o);
-    }
-    if ((!boundscheck) || likely((0 <= wrapped_i) & (wrapped_i < PyList_GET_SIZE(o)))) {
-        PyObject *r = PyList_GET_ITEM(o, wrapped_i);
-        Py_INCREF(r);
-        return r;
-    }
-    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i));
-#else
-    return PySequence_GetItem(o, i);
-#endif
-}
-static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Tuple_Fast(PyObject *o, Py_ssize_t i,
-                                                              CYTHON_NCP_UNUSED int wraparound,
-                                                              CYTHON_NCP_UNUSED int boundscheck) {
-#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
-    Py_ssize_t wrapped_i = i;
-    if (wraparound & unlikely(i < 0)) {
-        wrapped_i += PyTuple_GET_SIZE(o);
-    }
-    if ((!boundscheck) || likely((0 <= wrapped_i) & (wrapped_i < PyTuple_GET_SIZE(o)))) {
-        PyObject *r = PyTuple_GET_ITEM(o, wrapped_i);
-        Py_INCREF(r);
-        return r;
-    }
-    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i));
-#else
-    return PySequence_GetItem(o, i);
-#endif
-}
-static CYTHON_INLINE PyObject *__Pyx_GetItemInt_Fast(PyObject *o, Py_ssize_t i, int is_list,
-                                                     CYTHON_NCP_UNUSED int wraparound,
-                                                     CYTHON_NCP_UNUSED int boundscheck) {
-#if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS && CYTHON_USE_TYPE_SLOTS
-    if (is_list || PyList_CheckExact(o)) {
-        Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyList_GET_SIZE(o);
-        if ((!boundscheck) || (likely((n >= 0) & (n < PyList_GET_SIZE(o))))) {
-            PyObject *r = PyList_GET_ITEM(o, n);
-            Py_INCREF(r);
-            return r;
-        }
-    }
-    else if (PyTuple_CheckExact(o)) {
-        Py_ssize_t n = ((!wraparound) | likely(i >= 0)) ? i : i + PyTuple_GET_SIZE(o);
-        if ((!boundscheck) || likely((n >= 0) & (n < PyTuple_GET_SIZE(o)))) {
-            PyObject *r = PyTuple_GET_ITEM(o, n);
-            Py_INCREF(r);
-            return r;
-        }
-    } else {
-        PySequenceMethods *m = Py_TYPE(o)->tp_as_sequence;
-        if (likely(m && m->sq_item)) {
-            if (wraparound && unlikely(i < 0) && likely(m->sq_length)) {
-                Py_ssize_t l = m->sq_length(o);
-                if (likely(l >= 0)) {
-                    i += l;
-                } else {
-                    if (!PyErr_ExceptionMatches(PyExc_OverflowError))
-                        return NULL;
-                    PyErr_Clear();
-                }
-            }
-            return m->sq_item(o, i);
-        }
-    }
-#else
-    if (is_list || PySequence_Check(o)) {
-        return PySequence_GetItem(o, i);
-    }
-#endif
-    return __Pyx_GetItemInt_Generic(o, PyInt_FromSsize_t(i));
-}
-
-/* ObjectGetItem */
-      #if CYTHON_USE_TYPE_SLOTS
-static PyObject *__Pyx_PyObject_GetIndex(PyObject *obj, PyObject* index) {
-    PyObject *runerr;
-    Py_ssize_t key_value;
-    PySequenceMethods *m = Py_TYPE(obj)->tp_as_sequence;
-    if (unlikely(!(m && m->sq_item))) {
-        PyErr_Format(PyExc_TypeError, "'%.200s' object is not subscriptable", Py_TYPE(obj)->tp_name);
-        return NULL;
-    }
-    key_value = __Pyx_PyIndex_AsSsize_t(index);
-    if (likely(key_value != -1 || !(runerr = PyErr_Occurred()))) {
-        return __Pyx_GetItemInt_Fast(obj, key_value, 0, 1, 1);
-    }
-    if (PyErr_GivenExceptionMatches(runerr, PyExc_OverflowError)) {
-        PyErr_Clear();
-        PyErr_Format(PyExc_IndexError, "cannot fit '%.200s' into an index-sized integer", Py_TYPE(index)->tp_name);
-    }
-    return NULL;
-}
-static PyObject *__Pyx_PyObject_GetItem(PyObject *obj, PyObject* key) {
-    PyMappingMethods *m = Py_TYPE(obj)->tp_as_mapping;
-    if (likely(m && m->mp_subscript)) {
-        return m->mp_subscript(obj, key);
-    }
-    return __Pyx_PyObject_GetIndex(obj, key);
-}
-#endif
-
-/* PyObjectSetAttrStr */
-      #if CYTHON_USE_TYPE_SLOTS
-static CYTHON_INLINE int __Pyx_PyObject_SetAttrStr(PyObject* obj, PyObject* attr_name, PyObject* value) {
-    PyTypeObject* tp = Py_TYPE(obj);
-    if (likely(tp->tp_setattro))
-        return tp->tp_setattro(obj, attr_name, value);
-#if PY_MAJOR_VERSION < 3
-    if (likely(tp->tp_setattr))
-        return tp->tp_setattr(obj, PyString_AS_STRING(attr_name), value);
-#endif
-    return PyObject_SetAttr(obj, attr_name, value);
-}
-#endif
-
-/* RaiseArgTupleInvalid */
-      static void __Pyx_RaiseArgtupleInvalid(
-    const char* func_name,
-    int exact,
-    Py_ssize_t num_min,
-    Py_ssize_t num_max,
-    Py_ssize_t num_found)
-{
-    Py_ssize_t num_expected;
-    const char *more_or_less;
-    if (num_found < num_min) {
-        num_expected = num_min;
-        more_or_less = "at least";
-    } else {
-        num_expected = num_max;
-        more_or_less = "at most";
-    }
-    if (exact) {
-        more_or_less = "exactly";
-    }
-    PyErr_Format(PyExc_TypeError,
-                 "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)",
-                 func_name, more_or_less, num_expected,
-                 (num_expected == 1) ? "" : "s", num_found);
-}
-
-/* RaiseDoubleKeywords */
-      static void __Pyx_RaiseDoubleKeywordsError(
-    const char* func_name,
-    PyObject* kw_name)
-{
-    PyErr_Format(PyExc_TypeError,
-        #if PY_MAJOR_VERSION >= 3
-        "%s() got multiple values for keyword argument '%U'", func_name, kw_name);
-        #else
-        "%s() got multiple values for keyword argument '%s'", func_name,
-        PyString_AsString(kw_name));
-        #endif
-}
-
-/* ParseKeywords */
-      static int __Pyx_ParseOptionalKeywords(
-    PyObject *kwds,
-    PyObject **argnames[],
-    PyObject *kwds2,
-    PyObject *values[],
-    Py_ssize_t num_pos_args,
-    const char* function_name)
-{
-    PyObject *key = 0, *value = 0;
-    Py_ssize_t pos = 0;
-    PyObject*** name;
-    PyObject*** first_kw_arg = argnames + num_pos_args;
-    while (PyDict_Next(kwds, &pos, &key, &value)) {
-        name = first_kw_arg;
-        while (*name && (**name != key)) name++;
-        if (*name) {
-            values[name-argnames] = value;
-            continue;
-        }
-        name = first_kw_arg;
-        #if PY_MAJOR_VERSION < 3
-        if (likely(PyString_CheckExact(key)) || likely(PyString_Check(key))) {
-            while (*name) {
-                if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key))
-                        && _PyString_Eq(**name, key)) {
-                    values[name-argnames] = value;
-                    break;
-                }
-                name++;
-            }
-            if (*name) continue;
-            else {
-                PyObject*** argname = argnames;
-                while (argname != first_kw_arg) {
-                    if ((**argname == key) || (
-                            (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key))
-                             && _PyString_Eq(**argname, key))) {
-                        goto arg_passed_twice;
-                    }
-                    argname++;
-                }
-            }
-        } else
-        #endif
-        if (likely(PyUnicode_Check(key))) {
-            while (*name) {
-                int cmp = (**name == key) ? 0 :
-                #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
-                    (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :
-                #endif
-                    PyUnicode_Compare(**name, key);
-                if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad;
-                if (cmp == 0) {
-                    values[name-argnames] = value;
-                    break;
-                }
-                name++;
-            }
-            if (*name) continue;
-            else {
-                PyObject*** argname = argnames;
-                while (argname != first_kw_arg) {
-                    int cmp = (**argname == key) ? 0 :
-                    #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
-                        (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :
-                    #endif
-                        PyUnicode_Compare(**argname, key);
-                    if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad;
-                    if (cmp == 0) goto arg_passed_twice;
-                    argname++;
-                }
-            }
-        } else
-            goto invalid_keyword_type;
-        if (kwds2) {
-            if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad;
-        } else {
-            goto invalid_keyword;
-        }
-    }
-    return 0;
-arg_passed_twice:
-    __Pyx_RaiseDoubleKeywordsError(function_name, key);
-    goto bad;
-invalid_keyword_type:
-    PyErr_Format(PyExc_TypeError,
-        "%.200s() keywords must be strings", function_name);
-    goto bad;
-invalid_keyword:
-    PyErr_Format(PyExc_TypeError,
-    #if PY_MAJOR_VERSION < 3
-        "%.200s() got an unexpected keyword argument '%.200s'",
-        function_name, PyString_AsString(key));
-    #else
-        "%s() got an unexpected keyword argument '%U'",
-        function_name, key);
-    #endif
-bad:
-    return -1;
-}
-
-/* Import */
-      static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) {
-    PyObject *empty_list = 0;
-    PyObject *module = 0;
-    PyObject *global_dict = 0;
-    PyObject *empty_dict = 0;
-    PyObject *list;
-    #if PY_MAJOR_VERSION < 3
-    PyObject *py_import;
-    py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import);
-    if (!py_import)
-        goto bad;
-    #endif
-    if (from_list)
-        list = from_list;
-    else {
-        empty_list = PyList_New(0);
-        if (!empty_list)
-            goto bad;
-        list = empty_list;
-    }
-    global_dict = PyModule_GetDict(__pyx_m);
-    if (!global_dict)
-        goto bad;
-    empty_dict = PyDict_New();
-    if (!empty_dict)
-        goto bad;
-    {
-        #if PY_MAJOR_VERSION >= 3
-        if (level == -1) {
-            if (strchr(__Pyx_MODULE_NAME, '.')) {
-                module = PyImport_ImportModuleLevelObject(
-                    name, global_dict, empty_dict, list, 1);
-                if (!module) {
-                    if (!PyErr_ExceptionMatches(PyExc_ImportError))
-                        goto bad;
-                    PyErr_Clear();
-                }
-            }
-            level = 0;
-        }
-        #endif
-        if (!module) {
-            #if PY_MAJOR_VERSION < 3
-            PyObject *py_level = PyInt_FromLong(level);
-            if (!py_level)
-                goto bad;
-            module = PyObject_CallFunctionObjArgs(py_import,
-                name, global_dict, empty_dict, list, py_level, NULL);
-            Py_DECREF(py_level);
-            #else
-            module = PyImport_ImportModuleLevelObject(
-                name, global_dict, empty_dict, list, level);
-            #endif
-        }
-    }
-bad:
-    #if PY_MAJOR_VERSION < 3
-    Py_XDECREF(py_import);
-    #endif
-    Py_XDECREF(empty_list);
-    Py_XDECREF(empty_dict);
-    return module;
-}
-
-/* ImportFrom */
-      static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name) {
-    PyObject* value = __Pyx_PyObject_GetAttrStr(module, name);
-    if (unlikely(!value) && PyErr_ExceptionMatches(PyExc_AttributeError)) {
-        PyErr_Format(PyExc_ImportError,
-        #if PY_MAJOR_VERSION < 3
-            "cannot import name %.230s", PyString_AS_STRING(name));
-        #else
-            "cannot import name %S", name);
-        #endif
-    }
-    return value;
-}
-
-/* FetchCommonType */
-      static PyTypeObject* __Pyx_FetchCommonType(PyTypeObject* type) {
-    PyObject* fake_module;
-    PyTypeObject* cached_type = NULL;
-    fake_module = PyImport_AddModule((char*) "_cython_" CYTHON_ABI);
-    if (!fake_module) return NULL;
-    Py_INCREF(fake_module);
-    cached_type = (PyTypeObject*) PyObject_GetAttrString(fake_module, type->tp_name);
-    if (cached_type) {
-        if (!PyType_Check((PyObject*)cached_type)) {
-            PyErr_Format(PyExc_TypeError,
-                "Shared Cython type %.200s is not a type object",
-                type->tp_name);
-            goto bad;
-        }
-        if (cached_type->tp_basicsize != type->tp_basicsize) {
-            PyErr_Format(PyExc_TypeError,
-                "Shared Cython type %.200s has the wrong size, try recompiling",
-                type->tp_name);
-            goto bad;
-        }
-    } else {
-        if (!PyErr_ExceptionMatches(PyExc_AttributeError)) goto bad;
-        PyErr_Clear();
-        if (PyType_Ready(type) < 0) goto bad;
-        if (PyObject_SetAttrString(fake_module, type->tp_name, (PyObject*) type) < 0)
-            goto bad;
-        Py_INCREF(type);
-        cached_type = type;
-    }
-done:
-    Py_DECREF(fake_module);
-    return cached_type;
-bad:
-    Py_XDECREF(cached_type);
-    cached_type = NULL;
-    goto done;
-}
-
-/* CythonFunction */
-      #include <structmember.h>
-static PyObject *
-__Pyx_CyFunction_get_doc(__pyx_CyFunctionObject *op, CYTHON_UNUSED void *closure)
-{
-    if (unlikely(op->func_doc == NULL)) {
-        if (op->func.m_ml->ml_doc) {
-#if PY_MAJOR_VERSION >= 3
-            op->func_doc = PyUnicode_FromString(op->func.m_ml->ml_doc);
-#else
-            op->func_doc = PyString_FromString(op->func.m_ml->ml_doc);
-#endif
-            if (unlikely(op->func_doc == NULL))
-                return NULL;
-        } else {
-            Py_INCREF(Py_None);
-            return Py_None;
-        }
-    }
-    Py_INCREF(op->func_doc);
-    return op->func_doc;
-}
-static int
-__Pyx_CyFunction_set_doc(__pyx_CyFunctionObject *op, PyObject *value)
-{
-    PyObject *tmp = op->func_doc;
-    if (value == NULL) {
-        value = Py_None;
-    }
-    Py_INCREF(value);
-    op->func_doc = value;
-    Py_XDECREF(tmp);
-    return 0;
-}
-static PyObject *
-__Pyx_CyFunction_get_name(__pyx_CyFunctionObject *op)
-{
-    if (unlikely(op->func_name == NULL)) {
-#if PY_MAJOR_VERSION >= 3
-        op->func_name = PyUnicode_InternFromString(op->func.m_ml->ml_name);
-#else
-        op->func_name = PyString_InternFromString(op->func.m_ml->ml_name);
-#endif
-        if (unlikely(op->func_name == NULL))
-            return NULL;
-    }
-    Py_INCREF(op->func_name);
-    return op->func_name;
-}
-static int
-__Pyx_CyFunction_set_name(__pyx_CyFunctionObject *op, PyObject *value)
-{
-    PyObject *tmp;
-#if PY_MAJOR_VERSION >= 3
-    if (unlikely(value == NULL || !PyUnicode_Check(value))) {
-#else
-    if (unlikely(value == NULL || !PyString_Check(value))) {
-#endif
-        PyErr_SetString(PyExc_TypeError,
-                        "__name__ must be set to a string object");
-        return -1;
-    }
-    tmp = op->func_name;
-    Py_INCREF(value);
-    op->func_name = value;
-    Py_XDECREF(tmp);
-    return 0;
-}
-static PyObject *
-__Pyx_CyFunction_get_qualname(__pyx_CyFunctionObject *op)
-{
-    Py_INCREF(op->func_qualname);
-    return op->func_qualname;
-}
-static int
-__Pyx_CyFunction_set_qualname(__pyx_CyFunctionObject *op, PyObject *value)
-{
-    PyObject *tmp;
-#if PY_MAJOR_VERSION >= 3
-    if (unlikely(value == NULL || !PyUnicode_Check(value))) {
-#else
-    if (unlikely(value == NULL || !PyString_Check(value))) {
-#endif
-        PyErr_SetString(PyExc_TypeError,
-                        "__qualname__ must be set to a string object");
-        return -1;
-    }
-    tmp = op->func_qualname;
-    Py_INCREF(value);
-    op->func_qualname = value;
-    Py_XDECREF(tmp);
-    return 0;
-}
-static PyObject *
-__Pyx_CyFunction_get_self(__pyx_CyFunctionObject *m, CYTHON_UNUSED void *closure)
-{
-    PyObject *self;
-    self = m->func_closure;
-    if (self == NULL)
-        self = Py_None;
-    Py_INCREF(self);
-    return self;
-}
-static PyObject *
-__Pyx_CyFunction_get_dict(__pyx_CyFunctionObject *op)
-{
-    if (unlikely(op->func_dict == NULL)) {
-        op->func_dict = PyDict_New();
-        if (unlikely(op->func_dict == NULL))
-            return NULL;
-    }
-    Py_INCREF(op->func_dict);
-    return op->func_dict;
-}
-static int
-__Pyx_CyFunction_set_dict(__pyx_CyFunctionObject *op, PyObject *value)
-{
-    PyObject *tmp;
-    if (unlikely(value == NULL)) {
-        PyErr_SetString(PyExc_TypeError,
-               "function's dictionary may not be deleted");
-        return -1;
-    }
-    if (unlikely(!PyDict_Check(value))) {
-        PyErr_SetString(PyExc_TypeError,
-               "setting function's dictionary to a non-dict");
-        return -1;
-    }
-    tmp = op->func_dict;
-    Py_INCREF(value);
-    op->func_dict = value;
-    Py_XDECREF(tmp);
-    return 0;
-}
-static PyObject *
-__Pyx_CyFunction_get_globals(__pyx_CyFunctionObject *op)
-{
-    Py_INCREF(op->func_globals);
-    return op->func_globals;
-}
-static PyObject *
-__Pyx_CyFunction_get_closure(CYTHON_UNUSED __pyx_CyFunctionObject *op)
-{
-    Py_INCREF(Py_None);
-    return Py_None;
-}
-static PyObject *
-__Pyx_CyFunction_get_code(__pyx_CyFunctionObject *op)
-{
-    PyObject* result = (op->func_code) ? op->func_code : Py_None;
-    Py_INCREF(result);
-    return result;
-}
-static int
-__Pyx_CyFunction_init_defaults(__pyx_CyFunctionObject *op) {
-    int result = 0;
-    PyObject *res = op->defaults_getter((PyObject *) op);
-    if (unlikely(!res))
-        return -1;
-    #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
-    op->defaults_tuple = PyTuple_GET_ITEM(res, 0);
-    Py_INCREF(op->defaults_tuple);
-    op->defaults_kwdict = PyTuple_GET_ITEM(res, 1);
-    Py_INCREF(op->defaults_kwdict);
-    #else
-    op->defaults_tuple = PySequence_ITEM(res, 0);
-    if (unlikely(!op->defaults_tuple)) result = -1;
-    else {
-        op->defaults_kwdict = PySequence_ITEM(res, 1);
-        if (unlikely(!op->defaults_kwdict)) result = -1;
-    }
-    #endif
-    Py_DECREF(res);
-    return result;
-}
-static int
-__Pyx_CyFunction_set_defaults(__pyx_CyFunctionObject *op, PyObject* value) {
-    PyObject* tmp;
-    if (!value) {
-        value = Py_None;
-    } else if (value != Py_None && !PyTuple_Check(value)) {
-        PyErr_SetString(PyExc_TypeError,
-                        "__defaults__ must be set to a tuple object");
-        return -1;
-    }
-    Py_INCREF(value);
-    tmp = op->defaults_tuple;
-    op->defaults_tuple = value;
-    Py_XDECREF(tmp);
-    return 0;
-}
-static PyObject *
-__Pyx_CyFunction_get_defaults(__pyx_CyFunctionObject *op) {
-    PyObject* result = op->defaults_tuple;
-    if (unlikely(!result)) {
-        if (op->defaults_getter) {
-            if (__Pyx_CyFunction_init_defaults(op) < 0) return NULL;
-            result = op->defaults_tuple;
-        } else {
-            result = Py_None;
-        }
-    }
-    Py_INCREF(result);
-    return result;
-}
-static int
-__Pyx_CyFunction_set_kwdefaults(__pyx_CyFunctionObject *op, PyObject* value) {
-    PyObject* tmp;
-    if (!value) {
-        value = Py_None;
-    } else if (value != Py_None && !PyDict_Check(value)) {
-        PyErr_SetString(PyExc_TypeError,
-                        "__kwdefaults__ must be set to a dict object");
-        return -1;
-    }
-    Py_INCREF(value);
-    tmp = op->defaults_kwdict;
-    op->defaults_kwdict = value;
-    Py_XDECREF(tmp);
-    return 0;
-}
-static PyObject *
-__Pyx_CyFunction_get_kwdefaults(__pyx_CyFunctionObject *op) {
-    PyObject* result = op->defaults_kwdict;
-    if (unlikely(!result)) {
-        if (op->defaults_getter) {
-            if (__Pyx_CyFunction_init_defaults(op) < 0) return NULL;
-            result = op->defaults_kwdict;
-        } else {
-            result = Py_None;
-        }
-    }
-    Py_INCREF(result);
-    return result;
-}
-static int
-__Pyx_CyFunction_set_annotations(__pyx_CyFunctionObject *op, PyObject* value) {
-    PyObject* tmp;
-    if (!value || value == Py_None) {
-        value = NULL;
-    } else if (!PyDict_Check(value)) {
-        PyErr_SetString(PyExc_TypeError,
-                        "__annotations__ must be set to a dict object");
-        return -1;
-    }
-    Py_XINCREF(value);
-    tmp = op->func_annotations;
-    op->func_annotations = value;
-    Py_XDECREF(tmp);
-    return 0;
-}
-static PyObject *
-__Pyx_CyFunction_get_annotations(__pyx_CyFunctionObject *op) {
-    PyObject* result = op->func_annotations;
-    if (unlikely(!result)) {
-        result = PyDict_New();
-        if (unlikely(!result)) return NULL;
-        op->func_annotations = result;
-    }
-    Py_INCREF(result);
-    return result;
-}
-static PyGetSetDef __pyx_CyFunction_getsets[] = {
-    {(char *) "func_doc", (getter)__Pyx_CyFunction_get_doc, (setter)__Pyx_CyFunction_set_doc, 0, 0},
-    {(char *) "__doc__",  (getter)__Pyx_CyFunction_get_doc, (setter)__Pyx_CyFunction_set_doc, 0, 0},
-    {(char *) "func_name", (getter)__Pyx_CyFunction_get_name, (setter)__Pyx_CyFunction_set_name, 0, 0},
-    {(char *) "__name__", (getter)__Pyx_CyFunction_get_name, (setter)__Pyx_CyFunction_set_name, 0, 0},
-    {(char *) "__qualname__", (getter)__Pyx_CyFunction_get_qualname, (setter)__Pyx_CyFunction_set_qualname, 0, 0},
-    {(char *) "__self__", (getter)__Pyx_CyFunction_get_self, 0, 0, 0},
-    {(char *) "func_dict", (getter)__Pyx_CyFunction_get_dict, (setter)__Pyx_CyFunction_set_dict, 0, 0},
-    {(char *) "__dict__", (getter)__Pyx_CyFunction_get_dict, (setter)__Pyx_CyFunction_set_dict, 0, 0},
-    {(char *) "func_globals", (getter)__Pyx_CyFunction_get_globals, 0, 0, 0},
-    {(char *) "__globals__", (getter)__Pyx_CyFunction_get_globals, 0, 0, 0},
-    {(char *) "func_closure", (getter)__Pyx_CyFunction_get_closure, 0, 0, 0},
-    {(char *) "__closure__", (getter)__Pyx_CyFunction_get_closure, 0, 0, 0},
-    {(char *) "func_code", (getter)__Pyx_CyFunction_get_code, 0, 0, 0},
-    {(char *) "__code__", (getter)__Pyx_CyFunction_get_code, 0, 0, 0},
-    {(char *) "func_defaults", (getter)__Pyx_CyFunction_get_defaults, (setter)__Pyx_CyFunction_set_defaults, 0, 0},
-    {(char *) "__defaults__", (getter)__Pyx_CyFunction_get_defaults, (setter)__Pyx_CyFunction_set_defaults, 0, 0},
-    {(char *) "__kwdefaults__", (getter)__Pyx_CyFunction_get_kwdefaults, (setter)__Pyx_CyFunction_set_kwdefaults, 0, 0},
-    {(char *) "__annotations__", (getter)__Pyx_CyFunction_get_annotations, (setter)__Pyx_CyFunction_set_annotations, 0, 0},
-    {0, 0, 0, 0, 0}
-};
-static PyMemberDef __pyx_CyFunction_members[] = {
-    {(char *) "__module__", T_OBJECT, offsetof(PyCFunctionObject, m_module), PY_WRITE_RESTRICTED, 0},
-    {0, 0, 0,  0, 0}
-};
-static PyObject *
-__Pyx_CyFunction_reduce(__pyx_CyFunctionObject *m, CYTHON_UNUSED PyObject *args)
-{
-#if PY_MAJOR_VERSION >= 3
-    return PyUnicode_FromString(m->func.m_ml->ml_name);
-#else
-    return PyString_FromString(m->func.m_ml->ml_name);
-#endif
-}
-static PyMethodDef __pyx_CyFunction_methods[] = {
-    {"__reduce__", (PyCFunction)__Pyx_CyFunction_reduce, METH_VARARGS, 0},
-    {0, 0, 0, 0}
-};
-#if PY_VERSION_HEX < 0x030500A0
-#define __Pyx_CyFunction_weakreflist(cyfunc) ((cyfunc)->func_weakreflist)
-#else
-#define __Pyx_CyFunction_weakreflist(cyfunc) ((cyfunc)->func.m_weakreflist)
-#endif
-static PyObject *__Pyx_CyFunction_New(PyTypeObject *type, PyMethodDef *ml, int flags, PyObject* qualname,
-                                      PyObject *closure, PyObject *module, PyObject* globals, PyObject* code) {
-    __pyx_CyFunctionObject *op = PyObject_GC_New(__pyx_CyFunctionObject, type);
-    if (op == NULL)
-        return NULL;
-    op->flags = flags;
-    __Pyx_CyFunction_weakreflist(op) = NULL;
-    op->func.m_ml = ml;
-    op->func.m_self = (PyObject *) op;
-    Py_XINCREF(closure);
-    op->func_closure = closure;
-    Py_XINCREF(module);
-    op->func.m_module = module;
-    op->func_dict = NULL;
-    op->func_name = NULL;
-    Py_INCREF(qualname);
-    op->func_qualname = qualname;
-    op->func_doc = NULL;
-    op->func_classobj = NULL;
-    op->func_globals = globals;
-    Py_INCREF(op->func_globals);
-    Py_XINCREF(code);
-    op->func_code = code;
-    op->defaults_pyobjects = 0;
-    op->defaults = NULL;
-    op->defaults_tuple = NULL;
-    op->defaults_kwdict = NULL;
-    op->defaults_getter = NULL;
-    op->func_annotations = NULL;
-    PyObject_GC_Track(op);
-    return (PyObject *) op;
-}
-static int
-__Pyx_CyFunction_clear(__pyx_CyFunctionObject *m)
-{
-    Py_CLEAR(m->func_closure);
-    Py_CLEAR(m->func.m_module);
-    Py_CLEAR(m->func_dict);
-    Py_CLEAR(m->func_name);
-    Py_CLEAR(m->func_qualname);
-    Py_CLEAR(m->func_doc);
-    Py_CLEAR(m->func_globals);
-    Py_CLEAR(m->func_code);
-    Py_CLEAR(m->func_classobj);
-    Py_CLEAR(m->defaults_tuple);
-    Py_CLEAR(m->defaults_kwdict);
-    Py_CLEAR(m->func_annotations);
-    if (m->defaults) {
-        PyObject **pydefaults = __Pyx_CyFunction_Defaults(PyObject *, m);
-        int i;
-        for (i = 0; i < m->defaults_pyobjects; i++)
-            Py_XDECREF(pydefaults[i]);
-        PyObject_Free(m->defaults);
-        m->defaults = NULL;
-    }
-    return 0;
-}
-static void __Pyx__CyFunction_dealloc(__pyx_CyFunctionObject *m)
-{
-    if (__Pyx_CyFunction_weakreflist(m) != NULL)
-        PyObject_ClearWeakRefs((PyObject *) m);
-    __Pyx_CyFunction_clear(m);
-    PyObject_GC_Del(m);
-}
-static void __Pyx_CyFunction_dealloc(__pyx_CyFunctionObject *m)
-{
-    PyObject_GC_UnTrack(m);
-    __Pyx__CyFunction_dealloc(m);
-}
-static int __Pyx_CyFunction_traverse(__pyx_CyFunctionObject *m, visitproc visit, void *arg)
-{
-    Py_VISIT(m->func_closure);
-    Py_VISIT(m->func.m_module);
-    Py_VISIT(m->func_dict);
-    Py_VISIT(m->func_name);
-    Py_VISIT(m->func_qualname);
-    Py_VISIT(m->func_doc);
-    Py_VISIT(m->func_globals);
-    Py_VISIT(m->func_code);
-    Py_VISIT(m->func_classobj);
-    Py_VISIT(m->defaults_tuple);
-    Py_VISIT(m->defaults_kwdict);
-    if (m->defaults) {
-        PyObject **pydefaults = __Pyx_CyFunction_Defaults(PyObject *, m);
-        int i;
-        for (i = 0; i < m->defaults_pyobjects; i++)
-            Py_VISIT(pydefaults[i]);
-    }
-    return 0;
-}
-static PyObject *__Pyx_CyFunction_descr_get(PyObject *func, PyObject *obj, PyObject *type)
-{
-    __pyx_CyFunctionObject *m = (__pyx_CyFunctionObject *) func;
-    if (m->flags & __Pyx_CYFUNCTION_STATICMETHOD) {
-        Py_INCREF(func);
-        return func;
-    }
-    if (m->flags & __Pyx_CYFUNCTION_CLASSMETHOD) {
-        if (type == NULL)
-            type = (PyObject *)(Py_TYPE(obj));
-        return __Pyx_PyMethod_New(func, type, (PyObject *)(Py_TYPE(type)));
-    }
-    if (obj == Py_None)
-        obj = NULL;
-    return __Pyx_PyMethod_New(func, obj, type);
-}
-static PyObject*
-__Pyx_CyFunction_repr(__pyx_CyFunctionObject *op)
-{
-#if PY_MAJOR_VERSION >= 3
-    return PyUnicode_FromFormat("<cyfunction %U at %p>",
-                                op->func_qualname, (void *)op);
-#else
-    return PyString_FromFormat("<cyfunction %s at %p>",
-                               PyString_AsString(op->func_qualname), (void *)op);
-#endif
-}
-static PyObject * __Pyx_CyFunction_CallMethod(PyObject *func, PyObject *self, PyObject *arg, PyObject *kw) {
-    PyCFunctionObject* f = (PyCFunctionObject*)func;
-    PyCFunction meth = f->m_ml->ml_meth;
-    Py_ssize_t size;
-    switch (f->m_ml->ml_flags & (METH_VARARGS | METH_KEYWORDS | METH_NOARGS | METH_O)) {
-    case METH_VARARGS:
-        if (likely(kw == NULL || PyDict_Size(kw) == 0))
-            return (*meth)(self, arg);
-        break;
-    case METH_VARARGS | METH_KEYWORDS:
-        return (*(PyCFunctionWithKeywords)meth)(self, arg, kw);
-    case METH_NOARGS:
-        if (likely(kw == NULL || PyDict_Size(kw) == 0)) {
-            size = PyTuple_GET_SIZE(arg);
-            if (likely(size == 0))
-                return (*meth)(self, NULL);
-            PyErr_Format(PyExc_TypeError,
-                "%.200s() takes no arguments (%" CYTHON_FORMAT_SSIZE_T "d given)",
-                f->m_ml->ml_name, size);
-            return NULL;
-        }
-        break;
-    case METH_O:
-        if (likely(kw == NULL || PyDict_Size(kw) == 0)) {
-            size = PyTuple_GET_SIZE(arg);
-            if (likely(size == 1)) {
-                PyObject *result, *arg0;
-                #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
-                arg0 = PyTuple_GET_ITEM(arg, 0);
-                #else
-                arg0 = PySequence_ITEM(arg, 0); if (unlikely(!arg0)) return NULL;
-                #endif
-                result = (*meth)(self, arg0);
-                #if !(CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS)
-                Py_DECREF(arg0);
-                #endif
-                return result;
-            }
-            PyErr_Format(PyExc_TypeError,
-                "%.200s() takes exactly one argument (%" CYTHON_FORMAT_SSIZE_T "d given)",
-                f->m_ml->ml_name, size);
-            return NULL;
-        }
-        break;
-    default:
-        PyErr_SetString(PyExc_SystemError, "Bad call flags in "
-                        "__Pyx_CyFunction_Call. METH_OLDARGS is no "
-                        "longer supported!");
-        return NULL;
-    }
-    PyErr_Format(PyExc_TypeError, "%.200s() takes no keyword arguments",
-                 f->m_ml->ml_name);
-    return NULL;
-}
-static CYTHON_INLINE PyObject *__Pyx_CyFunction_Call(PyObject *func, PyObject *arg, PyObject *kw) {
-    return __Pyx_CyFunction_CallMethod(func, ((PyCFunctionObject*)func)->m_self, arg, kw);
-}
-static PyObject *__Pyx_CyFunction_CallAsMethod(PyObject *func, PyObject *args, PyObject *kw) {
-    PyObject *result;
-    __pyx_CyFunctionObject *cyfunc = (__pyx_CyFunctionObject *) func;
-    if ((cyfunc->flags & __Pyx_CYFUNCTION_CCLASS) && !(cyfunc->flags & __Pyx_CYFUNCTION_STATICMETHOD)) {
-        Py_ssize_t argc;
-        PyObject *new_args;
-        PyObject *self;
-        argc = PyTuple_GET_SIZE(args);
-        new_args = PyTuple_GetSlice(args, 1, argc);
-        if (unlikely(!new_args))
-            return NULL;
-        self = PyTuple_GetItem(args, 0);
-        if (unlikely(!self)) {
-            Py_DECREF(new_args);
-            return NULL;
-        }
-        result = __Pyx_CyFunction_CallMethod(func, self, new_args, kw);
-        Py_DECREF(new_args);
-    } else {
-        result = __Pyx_CyFunction_Call(func, args, kw);
-    }
-    return result;
-}
-static PyTypeObject __pyx_CyFunctionType_type = {
-    PyVarObject_HEAD_INIT(0, 0)
-    "cython_function_or_method",
-    sizeof(__pyx_CyFunctionObject),
-    0,
-    (destructor) __Pyx_CyFunction_dealloc,
-    0,
-    0,
-    0,
-#if PY_MAJOR_VERSION < 3
-    0,
-#else
-    0,
-#endif
-    (reprfunc) __Pyx_CyFunction_repr,
-    0,
-    0,
-    0,
-    0,
-    __Pyx_CyFunction_CallAsMethod,
-    0,
-    0,
-    0,
-    0,
-    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
-    0,
-    (traverseproc) __Pyx_CyFunction_traverse,
-    (inquiry) __Pyx_CyFunction_clear,
-    0,
-#if PY_VERSION_HEX < 0x030500A0
-    offsetof(__pyx_CyFunctionObject, func_weakreflist),
-#else
-    offsetof(PyCFunctionObject, m_weakreflist),
-#endif
-    0,
-    0,
-    __pyx_CyFunction_methods,
-    __pyx_CyFunction_members,
-    __pyx_CyFunction_getsets,
-    0,
-    0,
-    __Pyx_CyFunction_descr_get,
-    0,
-    offsetof(__pyx_CyFunctionObject, func_dict),
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-    0,
-#if PY_VERSION_HEX >= 0x030400a1
-    0,
-#endif
-};
-static int __pyx_CyFunction_init(void) {
-    __pyx_CyFunctionType = __Pyx_FetchCommonType(&__pyx_CyFunctionType_type);
-    if (unlikely(__pyx_CyFunctionType == NULL)) {
-        return -1;
-    }
-    return 0;
-}
-static CYTHON_INLINE void *__Pyx_CyFunction_InitDefaults(PyObject *func, size_t size, int pyobjects) {
-    __pyx_CyFunctionObject *m = (__pyx_CyFunctionObject *) func;
-    m->defaults = PyObject_Malloc(size);
-    if (unlikely(!m->defaults))
-        return PyErr_NoMemory();
-    memset(m->defaults, 0, size);
-    m->defaults_pyobjects = pyobjects;
-    return m->defaults;
-}
-static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsTuple(PyObject *func, PyObject *tuple) {
-    __pyx_CyFunctionObject *m = (__pyx_CyFunctionObject *) func;
-    m->defaults_tuple = tuple;
-    Py_INCREF(tuple);
-}
-static CYTHON_INLINE void __Pyx_CyFunction_SetDefaultsKwDict(PyObject *func, PyObject *dict) {
-    __pyx_CyFunctionObject *m = (__pyx_CyFunctionObject *) func;
-    m->defaults_kwdict = dict;
-    Py_INCREF(dict);
-}
-static CYTHON_INLINE void __Pyx_CyFunction_SetAnnotationsDict(PyObject *func, PyObject *dict) {
-    __pyx_CyFunctionObject *m = (__pyx_CyFunctionObject *) func;
-    m->func_annotations = dict;
-    Py_INCREF(dict);
-}
-
-/* CalculateMetaclass */
-          static PyObject *__Pyx_CalculateMetaclass(PyTypeObject *metaclass, PyObject *bases) {
-    Py_ssize_t i, nbases = PyTuple_GET_SIZE(bases);
-    for (i=0; i < nbases; i++) {
-        PyTypeObject *tmptype;
-        PyObject *tmp = PyTuple_GET_ITEM(bases, i);
-        tmptype = Py_TYPE(tmp);
-#if PY_MAJOR_VERSION < 3
-        if (tmptype == &PyClass_Type)
-            continue;
-#endif
-        if (!metaclass) {
-            metaclass = tmptype;
-            continue;
-        }
-        if (PyType_IsSubtype(metaclass, tmptype))
-            continue;
-        if (PyType_IsSubtype(tmptype, metaclass)) {
-            metaclass = tmptype;
-            continue;
-        }
-        PyErr_SetString(PyExc_TypeError,
-                        "metaclass conflict: "
-                        "the metaclass of a derived class "
-                        "must be a (non-strict) subclass "
-                        "of the metaclasses of all its bases");
-        return NULL;
-    }
-    if (!metaclass) {
-#if PY_MAJOR_VERSION < 3
-        metaclass = &PyClass_Type;
-#else
-        metaclass = &PyType_Type;
-#endif
-    }
-    Py_INCREF((PyObject*) metaclass);
-    return (PyObject*) metaclass;
-}
-
-/* Py3ClassCreate */
-          static PyObject *__Pyx_Py3MetaclassPrepare(PyObject *metaclass, PyObject *bases, PyObject *name,
-                                           PyObject *qualname, PyObject *mkw, PyObject *modname, PyObject *doc) {
-    PyObject *ns;
-    if (metaclass) {
-        PyObject *prep = __Pyx_PyObject_GetAttrStr(metaclass, __pyx_n_s_prepare);
-        if (prep) {
-            PyObject *pargs = PyTuple_Pack(2, name, bases);
-            if (unlikely(!pargs)) {
-                Py_DECREF(prep);
-                return NULL;
-            }
-            ns = PyObject_Call(prep, pargs, mkw);
-            Py_DECREF(prep);
-            Py_DECREF(pargs);
-        } else {
-            if (unlikely(!PyErr_ExceptionMatches(PyExc_AttributeError)))
-                return NULL;
-            PyErr_Clear();
-            ns = PyDict_New();
-        }
-    } else {
-        ns = PyDict_New();
-    }
-    if (unlikely(!ns))
-        return NULL;
-    if (unlikely(PyObject_SetItem(ns, __pyx_n_s_module, modname) < 0)) goto bad;
-    if (unlikely(PyObject_SetItem(ns, __pyx_n_s_qualname, qualname) < 0)) goto bad;
-    if (unlikely(doc && PyObject_SetItem(ns, __pyx_n_s_doc, doc) < 0)) goto bad;
-    return ns;
-bad:
-    Py_DECREF(ns);
-    return NULL;
-}
-static PyObject *__Pyx_Py3ClassCreate(PyObject *metaclass, PyObject *name, PyObject *bases,
-                                      PyObject *dict, PyObject *mkw,
-                                      int calculate_metaclass, int allow_py2_metaclass) {
-    PyObject *result, *margs;
-    PyObject *owned_metaclass = NULL;
-    if (allow_py2_metaclass) {
-        owned_metaclass = PyObject_GetItem(dict, __pyx_n_s_metaclass);
-        if (owned_metaclass) {
-            metaclass = owned_metaclass;
-        } else if (likely(PyErr_ExceptionMatches(PyExc_KeyError))) {
-            PyErr_Clear();
-        } else {
-            return NULL;
-        }
-    }
-    if (calculate_metaclass && (!metaclass || PyType_Check(metaclass))) {
-        metaclass = __Pyx_CalculateMetaclass((PyTypeObject*) metaclass, bases);
-        Py_XDECREF(owned_metaclass);
-        if (unlikely(!metaclass))
-            return NULL;
-        owned_metaclass = metaclass;
-    }
-    margs = PyTuple_Pack(3, name, bases, dict);
-    if (unlikely(!margs)) {
-        result = NULL;
-    } else {
-        result = PyObject_Call(metaclass, margs, mkw);
-        Py_DECREF(margs);
-    }
-    Py_XDECREF(owned_metaclass);
-    return result;
-}
-
-/* PyErrFetchRestore */
-          #if CYTHON_FAST_THREAD_STATE
-static CYTHON_INLINE void __Pyx_ErrRestoreInState(PyThreadState *tstate, PyObject *type, PyObject *value, PyObject *tb) {
-    PyObject *tmp_type, *tmp_value, *tmp_tb;
-    tmp_type = tstate->curexc_type;
-    tmp_value = tstate->curexc_value;
-    tmp_tb = tstate->curexc_traceback;
-    tstate->curexc_type = type;
-    tstate->curexc_value = value;
-    tstate->curexc_traceback = tb;
-    Py_XDECREF(tmp_type);
-    Py_XDECREF(tmp_value);
-    Py_XDECREF(tmp_tb);
-}
-static CYTHON_INLINE void __Pyx_ErrFetchInState(PyThreadState *tstate, PyObject **type, PyObject **value, PyObject **tb) {
-    *type = tstate->curexc_type;
-    *value = tstate->curexc_value;
-    *tb = tstate->curexc_traceback;
-    tstate->curexc_type = 0;
-    tstate->curexc_value = 0;
-    tstate->curexc_traceback = 0;
-}
-#endif
-
-/* CLineInTraceback */
-          #ifndef CYTHON_CLINE_IN_TRACEBACK
-static int __Pyx_CLineForTraceback(CYTHON_UNUSED PyThreadState *tstate, int c_line) {
-    PyObject *use_cline;
-    PyObject *ptype, *pvalue, *ptraceback;
-#if CYTHON_COMPILING_IN_CPYTHON
-    PyObject **cython_runtime_dict;
-#endif
-    if (unlikely(!__pyx_cython_runtime)) {
-        return c_line;
-    }
-    __Pyx_ErrFetchInState(tstate, &ptype, &pvalue, &ptraceback);
-#if CYTHON_COMPILING_IN_CPYTHON
-    cython_runtime_dict = _PyObject_GetDictPtr(__pyx_cython_runtime);
-    if (likely(cython_runtime_dict)) {
-      use_cline = __Pyx_PyDict_GetItemStr(*cython_runtime_dict, __pyx_n_s_cline_in_traceback);
-    } else
-#endif
-    {
-      PyObject *use_cline_obj = __Pyx_PyObject_GetAttrStr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback);
-      if (use_cline_obj) {
-        use_cline = PyObject_Not(use_cline_obj) ? Py_False : Py_True;
-        Py_DECREF(use_cline_obj);
-      } else {
-        PyErr_Clear();
-        use_cline = NULL;
-      }
-    }
-    if (!use_cline) {
-        c_line = 0;
-        PyObject_SetAttr(__pyx_cython_runtime, __pyx_n_s_cline_in_traceback, Py_False);
-    }
-    else if (PyObject_Not(use_cline) != 0) {
-        c_line = 0;
-    }
-    __Pyx_ErrRestoreInState(tstate, ptype, pvalue, ptraceback);
-    return c_line;
-}
-#endif
-
-/* CodeObjectCache */
-          static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) {
-    int start = 0, mid = 0, end = count - 1;
-    if (end >= 0 && code_line > entries[end].code_line) {
-        return count;
-    }
-    while (start < end) {
-        mid = start + (end - start) / 2;
-        if (code_line < entries[mid].code_line) {
-            end = mid;
-        } else if (code_line > entries[mid].code_line) {
-             start = mid + 1;
-        } else {
-            return mid;
-        }
-    }
-    if (code_line <= entries[mid].code_line) {
-        return mid;
-    } else {
-        return mid + 1;
-    }
-}
-static PyCodeObject *__pyx_find_code_object(int code_line) {
-    PyCodeObject* code_object;
-    int pos;
-    if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) {
-        return NULL;
-    }
-    pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line);
-    if (unlikely(pos >= __pyx_code_cache.count) || unlikely(__pyx_code_cache.entries[pos].code_line != code_line)) {
-        return NULL;
-    }
-    code_object = __pyx_code_cache.entries[pos].code_object;
-    Py_INCREF(code_object);
-    return code_object;
-}
-static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) {
-    int pos, i;
-    __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries;
-    if (unlikely(!code_line)) {
-        return;
-    }
-    if (unlikely(!entries)) {
-        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry));
-        if (likely(entries)) {
-            __pyx_code_cache.entries = entries;
-            __pyx_code_cache.max_count = 64;
-            __pyx_code_cache.count = 1;
-            entries[0].code_line = code_line;
-            entries[0].code_object = code_object;
-            Py_INCREF(code_object);
-        }
-        return;
-    }
-    pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line);
-    if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) {
-        PyCodeObject* tmp = entries[pos].code_object;
-        entries[pos].code_object = code_object;
-        Py_DECREF(tmp);
-        return;
-    }
-    if (__pyx_code_cache.count == __pyx_code_cache.max_count) {
-        int new_max = __pyx_code_cache.max_count + 64;
-        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc(
-            __pyx_code_cache.entries, (size_t)new_max*sizeof(__Pyx_CodeObjectCacheEntry));
-        if (unlikely(!entries)) {
-            return;
-        }
-        __pyx_code_cache.entries = entries;
-        __pyx_code_cache.max_count = new_max;
-    }
-    for (i=__pyx_code_cache.count; i>pos; i--) {
-        entries[i] = entries[i-1];
-    }
-    entries[pos].code_line = code_line;
-    entries[pos].code_object = code_object;
-    __pyx_code_cache.count++;
-    Py_INCREF(code_object);
-}
-
-/* AddTraceback */
-          #include "compile.h"
-#include "frameobject.h"
-#include "traceback.h"
-static PyCodeObject* __Pyx_CreateCodeObjectForTraceback(
-            const char *funcname, int c_line,
-            int py_line, const char *filename) {
-    PyCodeObject *py_code = 0;
-    PyObject *py_srcfile = 0;
-    PyObject *py_funcname = 0;
-    #if PY_MAJOR_VERSION < 3
-    py_srcfile = PyString_FromString(filename);
-    #else
-    py_srcfile = PyUnicode_FromString(filename);
-    #endif
-    if (!py_srcfile) goto bad;
-    if (c_line) {
-        #if PY_MAJOR_VERSION < 3
-        py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
-        #else
-        py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
-        #endif
-    }
-    else {
-        #if PY_MAJOR_VERSION < 3
-        py_funcname = PyString_FromString(funcname);
-        #else
-        py_funcname = PyUnicode_FromString(funcname);
-        #endif
-    }
-    if (!py_funcname) goto bad;
-    py_code = __Pyx_PyCode_New(
-        0,
-        0,
-        0,
-        0,
-        0,
-        __pyx_empty_bytes, /*PyObject *code,*/
-        __pyx_empty_tuple, /*PyObject *consts,*/
-        __pyx_empty_tuple, /*PyObject *names,*/
-        __pyx_empty_tuple, /*PyObject *varnames,*/
-        __pyx_empty_tuple, /*PyObject *freevars,*/
-        __pyx_empty_tuple, /*PyObject *cellvars,*/
-        py_srcfile,   /*PyObject *filename,*/
-        py_funcname,  /*PyObject *name,*/
-        py_line,
-        __pyx_empty_bytes  /*PyObject *lnotab*/
-    );
-    Py_DECREF(py_srcfile);
-    Py_DECREF(py_funcname);
-    return py_code;
-bad:
-    Py_XDECREF(py_srcfile);
-    Py_XDECREF(py_funcname);
-    return NULL;
-}
-static void __Pyx_AddTraceback(const char *funcname, int c_line,
-                               int py_line, const char *filename) {
-    PyCodeObject *py_code = 0;
-    PyFrameObject *py_frame = 0;
-    PyThreadState *tstate = __Pyx_PyThreadState_Current;
-    if (c_line) {
-        c_line = __Pyx_CLineForTraceback(tstate, c_line);
-    }
-    py_code = __pyx_find_code_object(c_line ? -c_line : py_line);
-    if (!py_code) {
-        py_code = __Pyx_CreateCodeObjectForTraceback(
-            funcname, c_line, py_line, filename);
-        if (!py_code) goto bad;
-        __pyx_insert_code_object(c_line ? -c_line : py_line, py_code);
-    }
-    py_frame = PyFrame_New(
-        tstate,            /*PyThreadState *tstate,*/
-        py_code,           /*PyCodeObject *code,*/
-        __pyx_d,    /*PyObject *globals,*/
-        0                  /*PyObject *locals*/
-    );
-    if (!py_frame) goto bad;
-    __Pyx_PyFrame_SetLineNumber(py_frame, py_line);
-    PyTraceBack_Here(py_frame);
-bad:
-    Py_XDECREF(py_code);
-    Py_XDECREF(py_frame);
-}
-
-/* CIntToPy */
-          static CYTHON_INLINE PyObject* __Pyx_PyInt_From_gdf_dtype(gdf_dtype value) {
-    const gdf_dtype neg_one = (gdf_dtype) -1, const_zero = (gdf_dtype) 0;
-    const int is_unsigned = neg_one > const_zero;
-    if (is_unsigned) {
-        if (sizeof(gdf_dtype) < sizeof(long)) {
-            return PyInt_FromLong((long) value);
-        } else if (sizeof(gdf_dtype) <= sizeof(unsigned long)) {
-            return PyLong_FromUnsignedLong((unsigned long) value);
-#ifdef HAVE_LONG_LONG
-        } else if (sizeof(gdf_dtype) <= sizeof(unsigned PY_LONG_LONG)) {
-            return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value);
-#endif
-        }
-    } else {
-        if (sizeof(gdf_dtype) <= sizeof(long)) {
-            return PyInt_FromLong((long) value);
-#ifdef HAVE_LONG_LONG
-        } else if (sizeof(gdf_dtype) <= sizeof(PY_LONG_LONG)) {
-            return PyLong_FromLongLong((PY_LONG_LONG) value);
-#endif
-        }
-    }
-    {
-        int one = 1; int little = (int)*(unsigned char *)&one;
-        unsigned char *bytes = (unsigned char *)&value;
-        return _PyLong_FromByteArray(bytes, sizeof(gdf_dtype),
-                                     little, !is_unsigned);
-    }
-}
-
-/* CIntFromPyVerify */
-          #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func_value)\
-    __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 0)
-#define __PYX_VERIFY_RETURN_INT_EXC(target_type, func_type, func_value)\
-    __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, 1)
-#define __PYX__VERIFY_RETURN_INT(target_type, func_type, func_value, exc)\
-    {\
-        func_type value = func_value;\
-        if (sizeof(target_type) < sizeof(func_type)) {\
-            if (unlikely(value != (func_type) (target_type) value)) {\
-                func_type zero = 0;\
-                if (exc && unlikely(value == (func_type)-1 && PyErr_Occurred()))\
-                    return (target_type) -1;\
-                if (is_unsigned && unlikely(value < zero))\
-                    goto raise_neg_overflow;\
-                else\
-                    goto raise_overflow;\
-            }\
-        }\
-        return (target_type) value;\
-    }
-
-/* Print */
-          #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION < 3
-static PyObject *__Pyx_GetStdout(void) {
-    PyObject *f = PySys_GetObject((char *)"stdout");
-    if (!f) {
-        PyErr_SetString(PyExc_RuntimeError, "lost sys.stdout");
-    }
-    return f;
-}
-static int __Pyx_Print(PyObject* f, PyObject *arg_tuple, int newline) {
-    int i;
-    if (!f) {
-        if (!(f = __Pyx_GetStdout()))
-            return -1;
-    }
-    Py_INCREF(f);
-    for (i=0; i < PyTuple_GET_SIZE(arg_tuple); i++) {
-        PyObject* v;
-        if (PyFile_SoftSpace(f, 1)) {
-            if (PyFile_WriteString(" ", f) < 0)
-                goto error;
-        }
-        v = PyTuple_GET_ITEM(arg_tuple, i);
-        if (PyFile_WriteObject(v, f, Py_PRINT_RAW) < 0)
-            goto error;
-        if (PyString_Check(v)) {
-            char *s = PyString_AsString(v);
-            Py_ssize_t len = PyString_Size(v);
-            if (len > 0) {
-                switch (s[len-1]) {
-                    case ' ': break;
-                    case '\f': case '\r': case '\n': case '\t': case '\v':
-                        PyFile_SoftSpace(f, 0);
-                        break;
-                    default:  break;
-                }
-            }
-        }
-    }
-    if (newline) {
-        if (PyFile_WriteString("\n", f) < 0)
-            goto error;
-        PyFile_SoftSpace(f, 0);
-    }
-    Py_DECREF(f);
-    return 0;
-error:
-    Py_DECREF(f);
-    return -1;
-}
-#else
-static int __Pyx_Print(PyObject* stream, PyObject *arg_tuple, int newline) {
-    PyObject* kwargs = 0;
-    PyObject* result = 0;
-    PyObject* end_string;
-    if (unlikely(!__pyx_print)) {
-        __pyx_print = PyObject_GetAttr(__pyx_b, __pyx_n_s_print);
-        if (!__pyx_print)
-            return -1;
-    }
-    if (stream) {
-        kwargs = PyDict_New();
-        if (unlikely(!kwargs))
-            return -1;
-        if (unlikely(PyDict_SetItem(kwargs, __pyx_n_s_file, stream) < 0))
-            goto bad;
-        if (!newline) {
-            end_string = PyUnicode_FromStringAndSize(" ", 1);
-            if (unlikely(!end_string))
-                goto bad;
-            if (PyDict_SetItem(kwargs, __pyx_n_s_end, end_string) < 0) {
-                Py_DECREF(end_string);
-                goto bad;
-            }
-            Py_DECREF(end_string);
-        }
-    } else if (!newline) {
-        if (unlikely(!__pyx_print_kwargs)) {
-            __pyx_print_kwargs = PyDict_New();
-            if (unlikely(!__pyx_print_kwargs))
-                return -1;
-            end_string = PyUnicode_FromStringAndSize(" ", 1);
-            if (unlikely(!end_string))
-                return -1;
-            if (PyDict_SetItem(__pyx_print_kwargs, __pyx_n_s_end, end_string) < 0) {
-                Py_DECREF(end_string);
-                return -1;
-            }
-            Py_DECREF(end_string);
-        }
-        kwargs = __pyx_print_kwargs;
-    }
-    result = PyObject_Call(__pyx_print, arg_tuple, kwargs);
-    if (unlikely(kwargs) && (kwargs != __pyx_print_kwargs))
-        Py_DECREF(kwargs);
-    if (!result)
-        return -1;
-    Py_DECREF(result);
-    return 0;
-bad:
-    if (kwargs != __pyx_print_kwargs)
-        Py_XDECREF(kwargs);
-    return -1;
-}
-#endif
-
-/* CIntFromPy */
-          static CYTHON_INLINE size_t __Pyx_PyInt_As_size_t(PyObject *x) {
-    const size_t neg_one = (size_t) -1, const_zero = (size_t) 0;
-    const int is_unsigned = neg_one > const_zero;
-#if PY_MAJOR_VERSION < 3
-    if (likely(PyInt_Check(x))) {
-        if (sizeof(size_t) < sizeof(long)) {
-            __PYX_VERIFY_RETURN_INT(size_t, long, PyInt_AS_LONG(x))
-        } else {
-            long val = PyInt_AS_LONG(x);
-            if (is_unsigned && unlikely(val < 0)) {
-                goto raise_neg_overflow;
-            }
-            return (size_t) val;
-        }
-    } else
-#endif
-    if (likely(PyLong_Check(x))) {
-        if (is_unsigned) {
-#if CYTHON_USE_PYLONG_INTERNALS
-            const digit* digits = ((PyLongObject*)x)->ob_digit;
-            switch (Py_SIZE(x)) {
-                case  0: return (size_t) 0;
-                case  1: __PYX_VERIFY_RETURN_INT(size_t, digit, digits[0])
-                case 2:
-                    if (8 * sizeof(size_t) > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(size_t) >= 2 * PyLong_SHIFT) {
-                            return (size_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
-                        }
-                    }
-                    break;
-                case 3:
-                    if (8 * sizeof(size_t) > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(size_t) >= 3 * PyLong_SHIFT) {
-                            return (size_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
-                        }
-                    }
-                    break;
-                case 4:
-                    if (8 * sizeof(size_t) > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(size_t) >= 4 * PyLong_SHIFT) {
-                            return (size_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
-                        }
-                    }
-                    break;
-            }
-#endif
-#if CYTHON_COMPILING_IN_CPYTHON
-            if (unlikely(Py_SIZE(x) < 0)) {
-                goto raise_neg_overflow;
-            }
-#else
-            {
-                int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
-                if (unlikely(result < 0))
-                    return (size_t) -1;
-                if (unlikely(result == 1))
-                    goto raise_neg_overflow;
-            }
-#endif
-            if (sizeof(size_t) <= sizeof(unsigned long)) {
-                __PYX_VERIFY_RETURN_INT_EXC(size_t, unsigned long, PyLong_AsUnsignedLong(x))
-#ifdef HAVE_LONG_LONG
-            } else if (sizeof(size_t) <= sizeof(unsigned PY_LONG_LONG)) {
-                __PYX_VERIFY_RETURN_INT_EXC(size_t, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
-#endif
-            }
-        } else {
-#if CYTHON_USE_PYLONG_INTERNALS
-            const digit* digits = ((PyLongObject*)x)->ob_digit;
-            switch (Py_SIZE(x)) {
-                case  0: return (size_t) 0;
-                case -1: __PYX_VERIFY_RETURN_INT(size_t, sdigit, (sdigit) (-(sdigit)digits[0]))
-                case  1: __PYX_VERIFY_RETURN_INT(size_t,  digit, +digits[0])
-                case -2:
-                    if (8 * sizeof(size_t) - 1 > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(size_t, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(size_t) - 1 > 2 * PyLong_SHIFT) {
-                            return (size_t) (((size_t)-1)*(((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
-                        }
-                    }
-                    break;
-                case 2:
-                    if (8 * sizeof(size_t) > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(size_t) - 1 > 2 * PyLong_SHIFT) {
-                            return (size_t) ((((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
-                        }
-                    }
-                    break;
-                case -3:
-                    if (8 * sizeof(size_t) - 1 > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(size_t, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(size_t) - 1 > 3 * PyLong_SHIFT) {
-                            return (size_t) (((size_t)-1)*(((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
-                        }
-                    }
-                    break;
-                case 3:
-                    if (8 * sizeof(size_t) > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(size_t) - 1 > 3 * PyLong_SHIFT) {
-                            return (size_t) ((((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
-                        }
-                    }
-                    break;
-                case -4:
-                    if (8 * sizeof(size_t) - 1 > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(size_t, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(size_t) - 1 > 4 * PyLong_SHIFT) {
-                            return (size_t) (((size_t)-1)*(((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
-                        }
-                    }
-                    break;
-                case 4:
-                    if (8 * sizeof(size_t) > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(size_t, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(size_t) - 1 > 4 * PyLong_SHIFT) {
-                            return (size_t) ((((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0])));
-                        }
-                    }
-                    break;
-            }
-#endif
-            if (sizeof(size_t) <= sizeof(long)) {
-                __PYX_VERIFY_RETURN_INT_EXC(size_t, long, PyLong_AsLong(x))
-#ifdef HAVE_LONG_LONG
-            } else if (sizeof(size_t) <= sizeof(PY_LONG_LONG)) {
-                __PYX_VERIFY_RETURN_INT_EXC(size_t, PY_LONG_LONG, PyLong_AsLongLong(x))
-#endif
-            }
-        }
-        {
-#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
-            PyErr_SetString(PyExc_RuntimeError,
-                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
-#else
-            size_t val;
-            PyObject *v = __Pyx_PyNumber_IntOrLong(x);
- #if PY_MAJOR_VERSION < 3
-            if (likely(v) && !PyLong_Check(v)) {
-                PyObject *tmp = v;
-                v = PyNumber_Long(tmp);
-                Py_DECREF(tmp);
-            }
- #endif
-            if (likely(v)) {
-                int one = 1; int is_little = (int)*(unsigned char *)&one;
-                unsigned char *bytes = (unsigned char *)&val;
-                int ret = _PyLong_AsByteArray((PyLongObject *)v,
-                                              bytes, sizeof(val),
-                                              is_little, !is_unsigned);
-                Py_DECREF(v);
-                if (likely(!ret))
-                    return val;
-            }
-#endif
-            return (size_t) -1;
-        }
-    } else {
-        size_t val;
-        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
-        if (!tmp) return (size_t) -1;
-        val = __Pyx_PyInt_As_size_t(tmp);
-        Py_DECREF(tmp);
-        return val;
-    }
-raise_overflow:
-    PyErr_SetString(PyExc_OverflowError,
-        "value too large to convert to size_t");
-    return (size_t) -1;
-raise_neg_overflow:
-    PyErr_SetString(PyExc_OverflowError,
-        "can't convert negative value to size_t");
-    return (size_t) -1;
-}
-
-/* CIntFromPy */
-          static CYTHON_INLINE gdf_dtype __Pyx_PyInt_As_gdf_dtype(PyObject *x) {
-    const gdf_dtype neg_one = (gdf_dtype) -1, const_zero = (gdf_dtype) 0;
-    const int is_unsigned = neg_one > const_zero;
-#if PY_MAJOR_VERSION < 3
-    if (likely(PyInt_Check(x))) {
-        if (sizeof(gdf_dtype) < sizeof(long)) {
-            __PYX_VERIFY_RETURN_INT(gdf_dtype, long, PyInt_AS_LONG(x))
-        } else {
-            long val = PyInt_AS_LONG(x);
-            if (is_unsigned && unlikely(val < 0)) {
-                goto raise_neg_overflow;
-            }
-            return (gdf_dtype) val;
-        }
-    } else
-#endif
-    if (likely(PyLong_Check(x))) {
-        if (is_unsigned) {
-#if CYTHON_USE_PYLONG_INTERNALS
-            const digit* digits = ((PyLongObject*)x)->ob_digit;
-            switch (Py_SIZE(x)) {
-                case  0: return (gdf_dtype) 0;
-                case  1: __PYX_VERIFY_RETURN_INT(gdf_dtype, digit, digits[0])
-                case 2:
-                    if (8 * sizeof(gdf_dtype) > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(gdf_dtype) >= 2 * PyLong_SHIFT) {
-                            return (gdf_dtype) (((((gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0]));
-                        }
-                    }
-                    break;
-                case 3:
-                    if (8 * sizeof(gdf_dtype) > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(gdf_dtype) >= 3 * PyLong_SHIFT) {
-                            return (gdf_dtype) (((((((gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0]));
-                        }
-                    }
-                    break;
-                case 4:
-                    if (8 * sizeof(gdf_dtype) > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(gdf_dtype) >= 4 * PyLong_SHIFT) {
-                            return (gdf_dtype) (((((((((gdf_dtype)digits[3]) << PyLong_SHIFT) | (gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0]));
-                        }
-                    }
-                    break;
-            }
-#endif
-#if CYTHON_COMPILING_IN_CPYTHON
-            if (unlikely(Py_SIZE(x) < 0)) {
-                goto raise_neg_overflow;
-            }
-#else
-            {
-                int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
-                if (unlikely(result < 0))
-                    return (gdf_dtype) -1;
-                if (unlikely(result == 1))
-                    goto raise_neg_overflow;
-            }
-#endif
-            if (sizeof(gdf_dtype) <= sizeof(unsigned long)) {
-                __PYX_VERIFY_RETURN_INT_EXC(gdf_dtype, unsigned long, PyLong_AsUnsignedLong(x))
-#ifdef HAVE_LONG_LONG
-            } else if (sizeof(gdf_dtype) <= sizeof(unsigned PY_LONG_LONG)) {
-                __PYX_VERIFY_RETURN_INT_EXC(gdf_dtype, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
-#endif
-            }
-        } else {
-#if CYTHON_USE_PYLONG_INTERNALS
-            const digit* digits = ((PyLongObject*)x)->ob_digit;
-            switch (Py_SIZE(x)) {
-                case  0: return (gdf_dtype) 0;
-                case -1: __PYX_VERIFY_RETURN_INT(gdf_dtype, sdigit, (sdigit) (-(sdigit)digits[0]))
-                case  1: __PYX_VERIFY_RETURN_INT(gdf_dtype,  digit, +digits[0])
-                case -2:
-                    if (8 * sizeof(gdf_dtype) - 1 > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(gdf_dtype, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(gdf_dtype) - 1 > 2 * PyLong_SHIFT) {
-                            return (gdf_dtype) (((gdf_dtype)-1)*(((((gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
-                        }
-                    }
-                    break;
-                case 2:
-                    if (8 * sizeof(gdf_dtype) > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(gdf_dtype) - 1 > 2 * PyLong_SHIFT) {
-                            return (gdf_dtype) ((((((gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
-                        }
-                    }
-                    break;
-                case -3:
-                    if (8 * sizeof(gdf_dtype) - 1 > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(gdf_dtype, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(gdf_dtype) - 1 > 3 * PyLong_SHIFT) {
-                            return (gdf_dtype) (((gdf_dtype)-1)*(((((((gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
-                        }
-                    }
-                    break;
-                case 3:
-                    if (8 * sizeof(gdf_dtype) > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(gdf_dtype) - 1 > 3 * PyLong_SHIFT) {
-                            return (gdf_dtype) ((((((((gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
-                        }
-                    }
-                    break;
-                case -4:
-                    if (8 * sizeof(gdf_dtype) - 1 > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(gdf_dtype, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(gdf_dtype) - 1 > 4 * PyLong_SHIFT) {
-                            return (gdf_dtype) (((gdf_dtype)-1)*(((((((((gdf_dtype)digits[3]) << PyLong_SHIFT) | (gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
-                        }
-                    }
-                    break;
-                case 4:
-                    if (8 * sizeof(gdf_dtype) > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(gdf_dtype, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(gdf_dtype) - 1 > 4 * PyLong_SHIFT) {
-                            return (gdf_dtype) ((((((((((gdf_dtype)digits[3]) << PyLong_SHIFT) | (gdf_dtype)digits[2]) << PyLong_SHIFT) | (gdf_dtype)digits[1]) << PyLong_SHIFT) | (gdf_dtype)digits[0])));
-                        }
-                    }
-                    break;
-            }
-#endif
-            if (sizeof(gdf_dtype) <= sizeof(long)) {
-                __PYX_VERIFY_RETURN_INT_EXC(gdf_dtype, long, PyLong_AsLong(x))
-#ifdef HAVE_LONG_LONG
-            } else if (sizeof(gdf_dtype) <= sizeof(PY_LONG_LONG)) {
-                __PYX_VERIFY_RETURN_INT_EXC(gdf_dtype, PY_LONG_LONG, PyLong_AsLongLong(x))
-#endif
-            }
-        }
-        {
-#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
-            PyErr_SetString(PyExc_RuntimeError,
-                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
-#else
-            gdf_dtype val;
-            PyObject *v = __Pyx_PyNumber_IntOrLong(x);
- #if PY_MAJOR_VERSION < 3
-            if (likely(v) && !PyLong_Check(v)) {
-                PyObject *tmp = v;
-                v = PyNumber_Long(tmp);
-                Py_DECREF(tmp);
-            }
- #endif
-            if (likely(v)) {
-                int one = 1; int is_little = (int)*(unsigned char *)&one;
-                unsigned char *bytes = (unsigned char *)&val;
-                int ret = _PyLong_AsByteArray((PyLongObject *)v,
-                                              bytes, sizeof(val),
-                                              is_little, !is_unsigned);
-                Py_DECREF(v);
-                if (likely(!ret))
-                    return val;
-            }
-#endif
-            return (gdf_dtype) -1;
-        }
-    } else {
-        gdf_dtype val;
-        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
-        if (!tmp) return (gdf_dtype) -1;
-        val = __Pyx_PyInt_As_gdf_dtype(tmp);
-        Py_DECREF(tmp);
-        return val;
-    }
-raise_overflow:
-    PyErr_SetString(PyExc_OverflowError,
-        "value too large to convert to gdf_dtype");
-    return (gdf_dtype) -1;
-raise_neg_overflow:
-    PyErr_SetString(PyExc_OverflowError,
-        "can't convert negative value to gdf_dtype");
-    return (gdf_dtype) -1;
-}
-
-/* CIntFromPy */
-          static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) {
-    const int neg_one = (int) -1, const_zero = (int) 0;
-    const int is_unsigned = neg_one > const_zero;
-#if PY_MAJOR_VERSION < 3
-    if (likely(PyInt_Check(x))) {
-        if (sizeof(int) < sizeof(long)) {
-            __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG(x))
-        } else {
-            long val = PyInt_AS_LONG(x);
-            if (is_unsigned && unlikely(val < 0)) {
-                goto raise_neg_overflow;
-            }
-            return (int) val;
-        }
-    } else
-#endif
-    if (likely(PyLong_Check(x))) {
-        if (is_unsigned) {
-#if CYTHON_USE_PYLONG_INTERNALS
-            const digit* digits = ((PyLongObject*)x)->ob_digit;
-            switch (Py_SIZE(x)) {
-                case  0: return (int) 0;
-                case  1: __PYX_VERIFY_RETURN_INT(int, digit, digits[0])
-                case 2:
-                    if (8 * sizeof(int) > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(int) >= 2 * PyLong_SHIFT) {
-                            return (int) (((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
-                        }
-                    }
-                    break;
-                case 3:
-                    if (8 * sizeof(int) > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(int) >= 3 * PyLong_SHIFT) {
-                            return (int) (((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
-                        }
-                    }
-                    break;
-                case 4:
-                    if (8 * sizeof(int) > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(int) >= 4 * PyLong_SHIFT) {
-                            return (int) (((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0]));
-                        }
-                    }
-                    break;
-            }
-#endif
-#if CYTHON_COMPILING_IN_CPYTHON
-            if (unlikely(Py_SIZE(x) < 0)) {
-                goto raise_neg_overflow;
-            }
-#else
-            {
-                int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
-                if (unlikely(result < 0))
-                    return (int) -1;
-                if (unlikely(result == 1))
-                    goto raise_neg_overflow;
-            }
-#endif
-            if (sizeof(int) <= sizeof(unsigned long)) {
-                __PYX_VERIFY_RETURN_INT_EXC(int, unsigned long, PyLong_AsUnsignedLong(x))
-#ifdef HAVE_LONG_LONG
-            } else if (sizeof(int) <= sizeof(unsigned PY_LONG_LONG)) {
-                __PYX_VERIFY_RETURN_INT_EXC(int, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
-#endif
-            }
-        } else {
-#if CYTHON_USE_PYLONG_INTERNALS
-            const digit* digits = ((PyLongObject*)x)->ob_digit;
-            switch (Py_SIZE(x)) {
-                case  0: return (int) 0;
-                case -1: __PYX_VERIFY_RETURN_INT(int, sdigit, (sdigit) (-(sdigit)digits[0]))
-                case  1: __PYX_VERIFY_RETURN_INT(int,  digit, +digits[0])
-                case -2:
-                    if (8 * sizeof(int) - 1 > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) {
-                            return (int) (((int)-1)*(((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
-                        }
-                    }
-                    break;
-                case 2:
-                    if (8 * sizeof(int) > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) {
-                            return (int) ((((((int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
-                        }
-                    }
-                    break;
-                case -3:
-                    if (8 * sizeof(int) - 1 > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) {
-                            return (int) (((int)-1)*(((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
-                        }
-                    }
-                    break;
-                case 3:
-                    if (8 * sizeof(int) > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) {
-                            return (int) ((((((((int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
-                        }
-                    }
-                    break;
-                case -4:
-                    if (8 * sizeof(int) - 1 > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(int, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) {
-                            return (int) (((int)-1)*(((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
-                        }
-                    }
-                    break;
-                case 4:
-                    if (8 * sizeof(int) > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(int, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(int) - 1 > 4 * PyLong_SHIFT) {
-                            return (int) ((((((((((int)digits[3]) << PyLong_SHIFT) | (int)digits[2]) << PyLong_SHIFT) | (int)digits[1]) << PyLong_SHIFT) | (int)digits[0])));
-                        }
-                    }
-                    break;
-            }
-#endif
-            if (sizeof(int) <= sizeof(long)) {
-                __PYX_VERIFY_RETURN_INT_EXC(int, long, PyLong_AsLong(x))
-#ifdef HAVE_LONG_LONG
-            } else if (sizeof(int) <= sizeof(PY_LONG_LONG)) {
-                __PYX_VERIFY_RETURN_INT_EXC(int, PY_LONG_LONG, PyLong_AsLongLong(x))
-#endif
-            }
-        }
-        {
-#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
-            PyErr_SetString(PyExc_RuntimeError,
-                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
-#else
-            int val;
-            PyObject *v = __Pyx_PyNumber_IntOrLong(x);
- #if PY_MAJOR_VERSION < 3
-            if (likely(v) && !PyLong_Check(v)) {
-                PyObject *tmp = v;
-                v = PyNumber_Long(tmp);
-                Py_DECREF(tmp);
-            }
- #endif
-            if (likely(v)) {
-                int one = 1; int is_little = (int)*(unsigned char *)&one;
-                unsigned char *bytes = (unsigned char *)&val;
-                int ret = _PyLong_AsByteArray((PyLongObject *)v,
-                                              bytes, sizeof(val),
-                                              is_little, !is_unsigned);
-                Py_DECREF(v);
-                if (likely(!ret))
-                    return val;
-            }
-#endif
-            return (int) -1;
-        }
-    } else {
-        int val;
-        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
-        if (!tmp) return (int) -1;
-        val = __Pyx_PyInt_As_int(tmp);
-        Py_DECREF(tmp);
-        return val;
-    }
-raise_overflow:
-    PyErr_SetString(PyExc_OverflowError,
-        "value too large to convert to int");
-    return (int) -1;
-raise_neg_overflow:
-    PyErr_SetString(PyExc_OverflowError,
-        "can't convert negative value to int");
-    return (int) -1;
-}
-
-/* PrintOne */
-          #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION < 3
-static int __Pyx_PrintOne(PyObject* f, PyObject *o) {
-    if (!f) {
-        if (!(f = __Pyx_GetStdout()))
-            return -1;
-    }
-    Py_INCREF(f);
-    if (PyFile_SoftSpace(f, 0)) {
-        if (PyFile_WriteString(" ", f) < 0)
-            goto error;
-    }
-    if (PyFile_WriteObject(o, f, Py_PRINT_RAW) < 0)
-        goto error;
-    if (PyFile_WriteString("\n", f) < 0)
-        goto error;
-    Py_DECREF(f);
-    return 0;
-error:
-    Py_DECREF(f);
-    return -1;
-    /* the line below is just to avoid C compiler
-     * warnings about unused functions */
-    return __Pyx_Print(f, NULL, 0);
-}
-#else
-static int __Pyx_PrintOne(PyObject* stream, PyObject *o) {
-    int res;
-    PyObject* arg_tuple = PyTuple_Pack(1, o);
-    if (unlikely(!arg_tuple))
-        return -1;
-    res = __Pyx_Print(stream, arg_tuple, 1);
-    Py_DECREF(arg_tuple);
-    return res;
-}
-#endif
-
-/* CIntToPy */
-          static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) {
-    const long neg_one = (long) -1, const_zero = (long) 0;
-    const int is_unsigned = neg_one > const_zero;
-    if (is_unsigned) {
-        if (sizeof(long) < sizeof(long)) {
-            return PyInt_FromLong((long) value);
-        } else if (sizeof(long) <= sizeof(unsigned long)) {
-            return PyLong_FromUnsignedLong((unsigned long) value);
-#ifdef HAVE_LONG_LONG
-        } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) {
-            return PyLong_FromUnsignedLongLong((unsigned PY_LONG_LONG) value);
-#endif
-        }
-    } else {
-        if (sizeof(long) <= sizeof(long)) {
-            return PyInt_FromLong((long) value);
-#ifdef HAVE_LONG_LONG
-        } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) {
-            return PyLong_FromLongLong((PY_LONG_LONG) value);
-#endif
-        }
-    }
-    {
-        int one = 1; int little = (int)*(unsigned char *)&one;
-        unsigned char *bytes = (unsigned char *)&value;
-        return _PyLong_FromByteArray(bytes, sizeof(long),
-                                     little, !is_unsigned);
-    }
-}
-
-/* CIntFromPy */
-          static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) {
-    const long neg_one = (long) -1, const_zero = (long) 0;
-    const int is_unsigned = neg_one > const_zero;
-#if PY_MAJOR_VERSION < 3
-    if (likely(PyInt_Check(x))) {
-        if (sizeof(long) < sizeof(long)) {
-            __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG(x))
-        } else {
-            long val = PyInt_AS_LONG(x);
-            if (is_unsigned && unlikely(val < 0)) {
-                goto raise_neg_overflow;
-            }
-            return (long) val;
-        }
-    } else
-#endif
-    if (likely(PyLong_Check(x))) {
-        if (is_unsigned) {
-#if CYTHON_USE_PYLONG_INTERNALS
-            const digit* digits = ((PyLongObject*)x)->ob_digit;
-            switch (Py_SIZE(x)) {
-                case  0: return (long) 0;
-                case  1: __PYX_VERIFY_RETURN_INT(long, digit, digits[0])
-                case 2:
-                    if (8 * sizeof(long) > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(long) >= 2 * PyLong_SHIFT) {
-                            return (long) (((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
-                        }
-                    }
-                    break;
-                case 3:
-                    if (8 * sizeof(long) > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(long) >= 3 * PyLong_SHIFT) {
-                            return (long) (((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
-                        }
-                    }
-                    break;
-                case 4:
-                    if (8 * sizeof(long) > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(long) >= 4 * PyLong_SHIFT) {
-                            return (long) (((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0]));
-                        }
-                    }
-                    break;
-            }
-#endif
-#if CYTHON_COMPILING_IN_CPYTHON
-            if (unlikely(Py_SIZE(x) < 0)) {
-                goto raise_neg_overflow;
-            }
-#else
-            {
-                int result = PyObject_RichCompareBool(x, Py_False, Py_LT);
-                if (unlikely(result < 0))
-                    return (long) -1;
-                if (unlikely(result == 1))
-                    goto raise_neg_overflow;
-            }
-#endif
-            if (sizeof(long) <= sizeof(unsigned long)) {
-                __PYX_VERIFY_RETURN_INT_EXC(long, unsigned long, PyLong_AsUnsignedLong(x))
-#ifdef HAVE_LONG_LONG
-            } else if (sizeof(long) <= sizeof(unsigned PY_LONG_LONG)) {
-                __PYX_VERIFY_RETURN_INT_EXC(long, unsigned PY_LONG_LONG, PyLong_AsUnsignedLongLong(x))
-#endif
-            }
-        } else {
-#if CYTHON_USE_PYLONG_INTERNALS
-            const digit* digits = ((PyLongObject*)x)->ob_digit;
-            switch (Py_SIZE(x)) {
-                case  0: return (long) 0;
-                case -1: __PYX_VERIFY_RETURN_INT(long, sdigit, (sdigit) (-(sdigit)digits[0]))
-                case  1: __PYX_VERIFY_RETURN_INT(long,  digit, +digits[0])
-                case -2:
-                    if (8 * sizeof(long) - 1 > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) {
-                            return (long) (((long)-1)*(((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
-                        }
-                    }
-                    break;
-                case 2:
-                    if (8 * sizeof(long) > 1 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 2 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) {
-                            return (long) ((((((long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
-                        }
-                    }
-                    break;
-                case -3:
-                    if (8 * sizeof(long) - 1 > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) {
-                            return (long) (((long)-1)*(((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
-                        }
-                    }
-                    break;
-                case 3:
-                    if (8 * sizeof(long) > 2 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 3 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) {
-                            return (long) ((((((((long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
-                        }
-                    }
-                    break;
-                case -4:
-                    if (8 * sizeof(long) - 1 > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(long, long, -(long) (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) {
-                            return (long) (((long)-1)*(((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
-                        }
-                    }
-                    break;
-                case 4:
-                    if (8 * sizeof(long) > 3 * PyLong_SHIFT) {
-                        if (8 * sizeof(unsigned long) > 4 * PyLong_SHIFT) {
-                            __PYX_VERIFY_RETURN_INT(long, unsigned long, (((((((((unsigned long)digits[3]) << PyLong_SHIFT) | (unsigned long)digits[2]) << PyLong_SHIFT) | (unsigned long)digits[1]) << PyLong_SHIFT) | (unsigned long)digits[0])))
-                        } else if (8 * sizeof(long) - 1 > 4 * PyLong_SHIFT) {
-                            return (long) ((((((((((long)digits[3]) << PyLong_SHIFT) | (long)digits[2]) << PyLong_SHIFT) | (long)digits[1]) << PyLong_SHIFT) | (long)digits[0])));
-                        }
-                    }
-                    break;
-            }
-#endif
-            if (sizeof(long) <= sizeof(long)) {
-                __PYX_VERIFY_RETURN_INT_EXC(long, long, PyLong_AsLong(x))
-#ifdef HAVE_LONG_LONG
-            } else if (sizeof(long) <= sizeof(PY_LONG_LONG)) {
-                __PYX_VERIFY_RETURN_INT_EXC(long, PY_LONG_LONG, PyLong_AsLongLong(x))
-#endif
-            }
-        }
-        {
-#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
-            PyErr_SetString(PyExc_RuntimeError,
-                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
-#else
-            long val;
-            PyObject *v = __Pyx_PyNumber_IntOrLong(x);
- #if PY_MAJOR_VERSION < 3
-            if (likely(v) && !PyLong_Check(v)) {
-                PyObject *tmp = v;
-                v = PyNumber_Long(tmp);
-                Py_DECREF(tmp);
-            }
- #endif
-            if (likely(v)) {
-                int one = 1; int is_little = (int)*(unsigned char *)&one;
-                unsigned char *bytes = (unsigned char *)&val;
-                int ret = _PyLong_AsByteArray((PyLongObject *)v,
-                                              bytes, sizeof(val),
-                                              is_little, !is_unsigned);
-                Py_DECREF(v);
-                if (likely(!ret))
-                    return val;
-            }
-#endif
-            return (long) -1;
-        }
-    } else {
-        long val;
-        PyObject *tmp = __Pyx_PyNumber_IntOrLong(x);
-        if (!tmp) return (long) -1;
-        val = __Pyx_PyInt_As_long(tmp);
-        Py_DECREF(tmp);
-        return val;
-    }
-raise_overflow:
-    PyErr_SetString(PyExc_OverflowError,
-        "value too large to convert to long");
-    return (long) -1;
-raise_neg_overflow:
-    PyErr_SetString(PyExc_OverflowError,
-        "can't convert negative value to long");
-    return (long) -1;
-}
-
-/* FastTypeChecks */
-          #if CYTHON_COMPILING_IN_CPYTHON
-static int __Pyx_InBases(PyTypeObject *a, PyTypeObject *b) {
-    while (a) {
-        a = a->tp_base;
-        if (a == b)
-            return 1;
-    }
-    return b == &PyBaseObject_Type;
-}
-static CYTHON_INLINE int __Pyx_IsSubtype(PyTypeObject *a, PyTypeObject *b) {
-    PyObject *mro;
-    if (a == b) return 1;
-    mro = a->tp_mro;
-    if (likely(mro)) {
-        Py_ssize_t i, n;
-        n = PyTuple_GET_SIZE(mro);
-        for (i = 0; i < n; i++) {
-            if (PyTuple_GET_ITEM(mro, i) == (PyObject *)b)
-                return 1;
-        }
-        return 0;
-    }
-    return __Pyx_InBases(a, b);
-}
-#if PY_MAJOR_VERSION == 2
-static int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject* exc_type2) {
-    PyObject *exception, *value, *tb;
-    int res;
-    __Pyx_PyThreadState_declare
-    __Pyx_PyThreadState_assign
-    __Pyx_ErrFetch(&exception, &value, &tb);
-    res = exc_type1 ? PyObject_IsSubclass(err, exc_type1) : 0;
-    if (unlikely(res == -1)) {
-        PyErr_WriteUnraisable(err);
-        res = 0;
-    }
-    if (!res) {
-        res = PyObject_IsSubclass(err, exc_type2);
-        if (unlikely(res == -1)) {
-            PyErr_WriteUnraisable(err);
-            res = 0;
-        }
-    }
-    __Pyx_ErrRestore(exception, value, tb);
-    return res;
-}
-#else
-static CYTHON_INLINE int __Pyx_inner_PyErr_GivenExceptionMatches2(PyObject *err, PyObject* exc_type1, PyObject *exc_type2) {
-    int res = exc_type1 ? __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type1) : 0;
-    if (!res) {
-        res = __Pyx_IsSubtype((PyTypeObject*)err, (PyTypeObject*)exc_type2);
-    }
-    return res;
-}
-#endif
-static int __Pyx_PyErr_GivenExceptionMatchesTuple(PyObject *exc_type, PyObject *tuple) {
-    Py_ssize_t i, n;
-    assert(PyExceptionClass_Check(exc_type));
-    n = PyTuple_GET_SIZE(tuple);
-#if PY_MAJOR_VERSION >= 3
-    for (i=0; i<n; i++) {
-        if (exc_type == PyTuple_GET_ITEM(tuple, i)) return 1;
-    }
-#endif
-    for (i=0; i<n; i++) {
-        PyObject *t = PyTuple_GET_ITEM(tuple, i);
-        #if PY_MAJOR_VERSION < 3
-        if (likely(exc_type == t)) return 1;
-        #endif
-        if (likely(PyExceptionClass_Check(t))) {
-            if (__Pyx_inner_PyErr_GivenExceptionMatches2(exc_type, NULL, t)) return 1;
-        } else {
-        }
-    }
-    return 0;
-}
-static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches(PyObject *err, PyObject* exc_type) {
-    if (likely(err == exc_type)) return 1;
-    if (likely(PyExceptionClass_Check(err))) {
-        if (likely(PyExceptionClass_Check(exc_type))) {
-            return __Pyx_inner_PyErr_GivenExceptionMatches2(err, NULL, exc_type);
-        } else if (likely(PyTuple_Check(exc_type))) {
-            return __Pyx_PyErr_GivenExceptionMatchesTuple(err, exc_type);
-        } else {
-        }
-    }
-    return PyErr_GivenExceptionMatches(err, exc_type);
-}
-static CYTHON_INLINE int __Pyx_PyErr_GivenExceptionMatches2(PyObject *err, PyObject *exc_type1, PyObject *exc_type2) {
-    assert(PyExceptionClass_Check(exc_type1));
-    assert(PyExceptionClass_Check(exc_type2));
-    if (likely(err == exc_type1 || err == exc_type2)) return 1;
-    if (likely(PyExceptionClass_Check(err))) {
-        return __Pyx_inner_PyErr_GivenExceptionMatches2(err, exc_type1, exc_type2);
-    }
-    return (PyErr_GivenExceptionMatches(err, exc_type1) || PyErr_GivenExceptionMatches(err, exc_type2));
-}
-#endif
-
-/* CheckBinaryVersion */
-          static int __Pyx_check_binary_version(void) {
-    char ctversion[4], rtversion[4];
-    PyOS_snprintf(ctversion, 4, "%d.%d", PY_MAJOR_VERSION, PY_MINOR_VERSION);
-    PyOS_snprintf(rtversion, 4, "%s", Py_GetVersion());
-    if (ctversion[0] != rtversion[0] || ctversion[2] != rtversion[2]) {
-        char message[200];
-        PyOS_snprintf(message, sizeof(message),
-                      "compiletime version %s of module '%.100s' "
-                      "does not match runtime version %s",
-                      ctversion, __Pyx_MODULE_NAME, rtversion);
-        return PyErr_WarnEx(NULL, message, 1);
-    }
-    return 0;
-}
-
-/* InitStrings */
-          static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
-    while (t->p) {
-        #if PY_MAJOR_VERSION < 3
-        if (t->is_unicode) {
-            *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
-        } else if (t->intern) {
-            *t->p = PyString_InternFromString(t->s);
-        } else {
-            *t->p = PyString_FromStringAndSize(t->s, t->n - 1);
-        }
-        #else
-        if (t->is_unicode | t->is_str) {
-            if (t->intern) {
-                *t->p = PyUnicode_InternFromString(t->s);
-            } else if (t->encoding) {
-                *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);
-            } else {
-                *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);
-            }
-        } else {
-            *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);
-        }
-        #endif
-        if (!*t->p)
-            return -1;
-        if (PyObject_Hash(*t->p) == -1)
-            return -1;
-        ++t;
-    }
-    return 0;
-}
-
-static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(const char* c_str) {
-    return __Pyx_PyUnicode_FromStringAndSize(c_str, (Py_ssize_t)strlen(c_str));
-}
-static CYTHON_INLINE const char* __Pyx_PyObject_AsString(PyObject* o) {
-    Py_ssize_t ignore;
-    return __Pyx_PyObject_AsStringAndSize(o, &ignore);
-}
-#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
-#if !CYTHON_PEP393_ENABLED
-static const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) {
-    char* defenc_c;
-    PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL);
-    if (!defenc) return NULL;
-    defenc_c = PyBytes_AS_STRING(defenc);
-#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
-    {
-        char* end = defenc_c + PyBytes_GET_SIZE(defenc);
-        char* c;
-        for (c = defenc_c; c < end; c++) {
-            if ((unsigned char) (*c) >= 128) {
-                PyUnicode_AsASCIIString(o);
-                return NULL;
-            }
-        }
-    }
-#endif
-    *length = PyBytes_GET_SIZE(defenc);
-    return defenc_c;
-}
-#else
-static CYTHON_INLINE const char* __Pyx_PyUnicode_AsStringAndSize(PyObject* o, Py_ssize_t *length) {
-    if (unlikely(__Pyx_PyUnicode_READY(o) == -1)) return NULL;
-#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
-    if (likely(PyUnicode_IS_ASCII(o))) {
-        *length = PyUnicode_GET_LENGTH(o);
-        return PyUnicode_AsUTF8(o);
-    } else {
-        PyUnicode_AsASCIIString(o);
-        return NULL;
-    }
-#else
-    return PyUnicode_AsUTF8AndSize(o, length);
-#endif
-}
-#endif
-#endif
-static CYTHON_INLINE const char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) {
-#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
-    if (
-#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
-            __Pyx_sys_getdefaultencoding_not_ascii &&
-#endif
-            PyUnicode_Check(o)) {
-        return __Pyx_PyUnicode_AsStringAndSize(o, length);
-    } else
-#endif
-#if (!CYTHON_COMPILING_IN_PYPY) || (defined(PyByteArray_AS_STRING) && defined(PyByteArray_GET_SIZE))
-    if (PyByteArray_Check(o)) {
-        *length = PyByteArray_GET_SIZE(o);
-        return PyByteArray_AS_STRING(o);
-    } else
-#endif
-    {
-        char* result;
-        int r = PyBytes_AsStringAndSize(o, &result, length);
-        if (unlikely(r < 0)) {
-            return NULL;
-        } else {
-            return result;
-        }
-    }
-}
-static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {
-   int is_true = x == Py_True;
-   if (is_true | (x == Py_False) | (x == Py_None)) return is_true;
-   else return PyObject_IsTrue(x);
-}
-static PyObject* __Pyx_PyNumber_IntOrLongWrongResultType(PyObject* result, const char* type_name) {
-#if PY_MAJOR_VERSION >= 3
-    if (PyLong_Check(result)) {
-        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
-                "__int__ returned non-int (type %.200s).  "
-                "The ability to return an instance of a strict subclass of int "
-                "is deprecated, and may be removed in a future version of Python.",
-                Py_TYPE(result)->tp_name)) {
-            Py_DECREF(result);
-            return NULL;
-        }
-        return result;
-    }
-#endif
-    PyErr_Format(PyExc_TypeError,
-                 "__%.4s__ returned non-%.4s (type %.200s)",
-                 type_name, type_name, Py_TYPE(result)->tp_name);
-    Py_DECREF(result);
-    return NULL;
-}
-static CYTHON_INLINE PyObject* __Pyx_PyNumber_IntOrLong(PyObject* x) {
-#if CYTHON_USE_TYPE_SLOTS
-  PyNumberMethods *m;
-#endif
-  const char *name = NULL;
-  PyObject *res = NULL;
-#if PY_MAJOR_VERSION < 3
-  if (likely(PyInt_Check(x) || PyLong_Check(x)))
-#else
-  if (likely(PyLong_Check(x)))
-#endif
-    return __Pyx_NewRef(x);
-#if CYTHON_USE_TYPE_SLOTS
-  m = Py_TYPE(x)->tp_as_number;
-  #if PY_MAJOR_VERSION < 3
-  if (m && m->nb_int) {
-    name = "int";
-    res = m->nb_int(x);
-  }
-  else if (m && m->nb_long) {
-    name = "long";
-    res = m->nb_long(x);
-  }
-  #else
-  if (likely(m && m->nb_int)) {
-    name = "int";
-    res = m->nb_int(x);
-  }
-  #endif
-#else
-  if (!PyBytes_CheckExact(x) && !PyUnicode_CheckExact(x)) {
-    res = PyNumber_Int(x);
-  }
-#endif
-  if (likely(res)) {
-#if PY_MAJOR_VERSION < 3
-    if (unlikely(!PyInt_Check(res) && !PyLong_Check(res))) {
-#else
-    if (unlikely(!PyLong_CheckExact(res))) {
-#endif
-        return __Pyx_PyNumber_IntOrLongWrongResultType(res, name);
-    }
-  }
-  else if (!PyErr_Occurred()) {
-    PyErr_SetString(PyExc_TypeError,
-                    "an integer is required");
-  }
-  return res;
-}
-static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {
-  Py_ssize_t ival;
-  PyObject *x;
-#if PY_MAJOR_VERSION < 3
-  if (likely(PyInt_CheckExact(b))) {
-    if (sizeof(Py_ssize_t) >= sizeof(long))
-        return PyInt_AS_LONG(b);
-    else
-        return PyInt_AsSsize_t(x);
-  }
-#endif
-  if (likely(PyLong_CheckExact(b))) {
-    #if CYTHON_USE_PYLONG_INTERNALS
-    const digit* digits = ((PyLongObject*)b)->ob_digit;
-    const Py_ssize_t size = Py_SIZE(b);
-    if (likely(__Pyx_sst_abs(size) <= 1)) {
-        ival = likely(size) ? digits[0] : 0;
-        if (size == -1) ival = -ival;
-        return ival;
-    } else {
-      switch (size) {
-         case 2:
-           if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) {
-             return (Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
-           }
-           break;
-         case -2:
-           if (8 * sizeof(Py_ssize_t) > 2 * PyLong_SHIFT) {
-             return -(Py_ssize_t) (((((size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
-           }
-           break;
-         case 3:
-           if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) {
-             return (Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
-           }
-           break;
-         case -3:
-           if (8 * sizeof(Py_ssize_t) > 3 * PyLong_SHIFT) {
-             return -(Py_ssize_t) (((((((size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
-           }
-           break;
-         case 4:
-           if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) {
-             return (Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
-           }
-           break;
-         case -4:
-           if (8 * sizeof(Py_ssize_t) > 4 * PyLong_SHIFT) {
-             return -(Py_ssize_t) (((((((((size_t)digits[3]) << PyLong_SHIFT) | (size_t)digits[2]) << PyLong_SHIFT) | (size_t)digits[1]) << PyLong_SHIFT) | (size_t)digits[0]));
-           }
-           break;
-      }
-    }
-    #endif
-    return PyLong_AsSsize_t(b);
-  }
-  x = PyNumber_Index(b);
-  if (!x) return -1;
-  ival = PyInt_AsSsize_t(x);
-  Py_DECREF(x);
-  return ival;
-}
-static CYTHON_INLINE PyObject * __Pyx_PyBool_FromLong(long b) {
-  return b ? __Pyx_NewRef(Py_True) : __Pyx_NewRef(Py_False);
-}
-static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {
-    return PyInt_FromSize_t(ival);
-}
-
-
-#endif /* Py_PYTHON_H */
diff --git a/python/bfs/bfs_wrapper.pyx b/python/bfs/bfs_wrapper.pyx
index f0e23b6bc96..50250baeef6 100644
--- a/python/bfs/bfs_wrapper.pyx
+++ b/python/bfs/bfs_wrapper.pyx
@@ -7,148 +7,6 @@ from librmm_cffi import librmm as rmm
 #from pygdf import Column
 import numpy as np
 
-dtypes = {np.int32: GDF_INT32, np.int64: GDF_INT64, np.float32: GDF_FLOAT32, np.float64: GDF_FLOAT64}
-
-def _get_ctype_ptr(obj):
-    # The manner to access the pointers in the gdf's might change, so
-    # encapsulating access in the following 3 methods. They might also be
-    # part of future gdf versions.
-    return obj.device_ctypes_pointer.value
-
-def _get_column_data_ptr(obj):
-    return _get_ctype_ptr(obj._column._data.to_gpu_array())
-
-def _get_column_valid_ptr(obj):
-    return _get_ctype_ptr(obj._column._mask.to_gpu_array())
-
-#def _get_gdf_as_matrix_ptr(gdf):
-#    return self._get_ctype_ptr(gdf.as_gpu_matrix())
-
-cdef create_column(col):
-    
-    x= <gdf_column*>malloc(sizeof(gdf_column))
-    cdef gdf_column* c_col = <gdf_column*>malloc(sizeof(gdf_column))
-    cdef uintptr_t data_ptr = _get_column_data_ptr(col)
-    #cdef uintptr_t valid_ptr = _get_column_valid_ptr(col)
-
-    gdf_column_view_augmented(<gdf_column*>c_col,
-                              <void*> data_ptr,
-                              <gdf_valid_type*> 0,
-                              <gdf_size_type>len(col),
-                              dtypes[col.dtype.type],
-                              <gdf_size_type>col.null_count)
-    
-    cdef uintptr_t col_ptr = <uintptr_t>c_col
-    return col_ptr
-
-class Graph:
-    """
-        cuGraph graph class containing basic graph creation and transformation operations.
-    """
-    def __init__(self):
-        """
-        Returns
-        -------
-        Graph : cuGraph.Graph.
-
-        Examples
-        --------
-        >>> import cuGraph
-        >>> G = cuGraph.Graph()
-        """
-        cdef gdf_graph* graph
-        graph = <gdf_graph*>calloc(1,sizeof(gdf_graph))
-
-        cdef uintptr_t graph_ptr = <uintptr_t>graph
-        self.graph_ptr = graph_ptr
-
-
-    def add_edge_list(self, source_col, dest_col, value_col=None):
-        """
-        Warp existing gdf columns representing an edge list in a gdf_graph. cuGraph does not own the memory used to represent this graph. This function does not allocate memory. 
-        The cuGraph graph should not already contain the connectivity information as an edge list.
-        If successful, the cuGraph graph descriptor contains the newly added edge list (edge_data is optional).
-
-        Parameters
-        ----------
-        source_indices : gdf_column       
-            This gdf_column of size E (number of edges) contains the index of the source for each edge.
-            Indices must be in the range [0, V-1]. 
-        destination_indices   : gdf_column
-            This gdf_column of size E (number of edges) contains the index of the destination for each edge. 
-            Indices must be in the range [0, V-1].
-        edge_data (optional)  : gdf_column
-            This pointer can be ``none``. If not, this gdf_column of size E (number of edges) contains the weiht for each edge. 
-            The type expected to be floating point.
-
-        Examples
-        --------
-        >>> import cuGraph
-        >>> import cudf
-        >>> from scipy.io import mmread
-        >>> M = ReadMtxFile(graph_file)
-        >>> sources = cudf.Series(M.row)
-        >>> destinations = cudf.Series(M.col)
-        >>> G = cuGraph.Graph()
-        >>> G.add_edge_list(sources,destinations,none)
-        
-        """
-
-        cdef uintptr_t graph = self.graph_ptr
-        cdef uintptr_t source=create_column(source_col)
-        cdef uintptr_t dest=create_column(dest_col)
-        cdef uintptr_t value
-        if value_col is None:
-            value = 0
-        else:
-            value=create_column(value_col)
-
-        gdf_edge_list_view(<gdf_graph*>graph,
-                       <gdf_column*>source,
-                       <gdf_column*>dest,
-                       <gdf_column*>value)
-    
-    def view_edge_list(self):
-        ##TO DO
-        """
-        Display the edge list.
-        """
-        cdef uintptr_t graph = self.graph_ptr
-        cdef gdf_graph* g = <gdf_graph*>graph
-        size = g.edgeList.src_indices.size
-        print(size)
-        cdef object cffi_view = <object>g.edgeList.src_indices
-        data = cudf._gdf.cffi_view_to_column_mem(cffi_view)
-        #return pygdf.Series(data)        
-        return 0
-
-    def add_adj_list(self, offsets_col, indices_col, value_col):
-        """
-        Warp existing gdf columns representing an adjacency list in a gdf_graph.
-        """
-        ##TO TEST
-        cdef uintptr_t graph = self.graph_ptr
-        cdef uintptr_t offsets=create_column(offsets_col)
-        cdef uintptr_t indices=create_column(indices_col)
-        cdef uintptr_t value
-        if value_col is None:
-            value = 0
-        else:
-            value=create_column(value_col)
-    
-        gdf_adj_list_view(<gdf_graph*>graph,
-                       <gdf_column*>offsets,
-                       <gdf_column*>indices,
-                       <gdf_column*>value)
-
-
-    def add_transpose(self):
-        """
-        Compute the transposed adjacency list from the edge list and add it to the existing graph.
-        """
-        cdef uintptr_t graph = self.graph_ptr
-        gdf_add_transpose(<gdf_graph*>graph)
-
 cpdef bfs(G, start, directed=True):
     """
     Find the distances and predecessors for a breadth first traversal of a graph.
diff --git a/python/cugraph.pyx b/python/cugraph.pyx
index 5959837b62b..6390cbb558b 100644
--- a/python/cugraph.pyx
+++ b/python/cugraph.pyx
@@ -3,3 +3,4 @@ include "pagerank/pagerank_wrapper.pyx"
 include "jaccard/jaccard_wrapper.pyx"
 include "grmat/grmat_wrapper.pyx"
 include "louvain/louvain_wrapper.pyx"
+include "bfs/bfs_wrapper.pyx"
\ No newline at end of file

From 1e95968dbb0d2a9a7de8ddeda23cc4ab69d5bf10 Mon Sep 17 00:00:00 2001
From: James Wyles <jwyles@nvidia.com>
Date: Thu, 31 Jan 2019 15:17:16 -0700
Subject: [PATCH 3/6] Tests for BFS added and working

---
 python/bfs/bfs_wrapper.pyx | 24 ++++++++-----
 python/bfs/c_bfs.pxd       | 71 +-------------------------------------
 python/bfs/test_bfs.py     | 69 ++++++++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 78 deletions(-)
 create mode 100644 python/bfs/test_bfs.py

diff --git a/python/bfs/bfs_wrapper.pyx b/python/bfs/bfs_wrapper.pyx
index 50250baeef6..3767f613ef6 100644
--- a/python/bfs/bfs_wrapper.pyx
+++ b/python/bfs/bfs_wrapper.pyx
@@ -25,9 +25,10 @@ cpdef bfs(G, start, directed=True):
     
     Returns
     -------
-    distances, predecessors : cudf.Series
-        distances gives the path distance for each vertex from the starting vertex
-        predecessors gives for each vertex the vertex it was reached from in the traversal
+    df : cudf.DataFrame
+        df['vertex'][i] gives the vertex id of the i'th vertex
+        df['distance'][i] gives the path distance for the i'th vertex from the starting vertex
+        df['predecessor'][i] gives for the i'th vertex the vertex it was reached from in the traversal
         
     Examples
     --------
@@ -42,10 +43,17 @@ cpdef bfs(G, start, directed=True):
     cdef uintptr_t graph = G.graph_ptr
     cdef gdf_graph* g = <gdf_graph*>graph
     num_verts = g.adjList.offsets.size - 1
-    distances = cudf.Series(np.zeros(num_verts, dtype=np.int32))
-    cdef uintptr_t distances_ptr = create_column(distances)
-    predecessors = cudf.Series(np.zeros(num_verts, dtype=np.int32))
-    cdef uintptr_t predecessors_ptr = create_column(distances)
+    
+    df = cudf.DataFrame()
+    df['vertex'] = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+    cdef uintptr_t vertex_ptr = create_column(df['vertex'])
+    df['distance'] = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+    cdef uintptr_t distances_ptr = create_column(df['distance'])
+    df['predecessor'] = cudf.Series(np.zeros(num_verts, dtype=np.int32))
+    cdef uintptr_t predecessors_ptr = create_column(df['predecessor'])
+    
+    err = g.adjList.get_vertex_identifiers(<gdf_column*>vertex_ptr)
+    cudf.bindings.cudf_cpp.check_gdf_error(err)
     
     gdf_bfs(<gdf_graph*>g, <gdf_column*>distances_ptr, <gdf_column*>predecessors_ptr, <int>start, <bool>directed)
-    return distances, predecessors
\ No newline at end of file
+    return df
\ No newline at end of file
diff --git a/python/bfs/c_bfs.pxd b/python/bfs/c_bfs.pxd
index ec4e8f8808e..7c97e351fd0 100644
--- a/python/bfs/c_bfs.pxd
+++ b/python/bfs/c_bfs.pxd
@@ -1,75 +1,6 @@
-
+from c_graph cimport *
 from libcpp cimport bool
 
-cdef extern from "cudf.h":
-
-    ctypedef enum gdf_error: 
-        pass
-
-    ctypedef enum gdf_dtype:
-        GDF_invalid=0,
-        GDF_INT8,
-        GDF_INT16,
-        GDF_INT32,
-        GDF_INT64,
-        GDF_FLOAT32,
-        GDF_FLOAT64,
-        GDF_DATE32,     
-        GDF_DATE64,     
-        GDF_TIMESTAMP,  
-        GDF_CATEGORY,
-        GDF_STRING,
-        N_GDF_TYPES
-
-    ctypedef unsigned char gdf_valid_type
-    ctypedef size_t gdf_size_type
- 
-    struct gdf_column_:
-        void *data                       
-        gdf_valid_type *valid
-        gdf_size_type size             
-        gdf_dtype dtype
-
-
-    ctypedef gdf_column_ gdf_column
-
-    cdef gdf_error gdf_column_view_augmented(gdf_column *column, 
-                              void *data, 
-                              gdf_valid_type *valid,
-                              gdf_size_type size, 
-                              gdf_dtype dtype,
-                              gdf_size_type null_count)
-
-    cdef gdf_error gdf_column_view_new(gdf_column *column,
-                              void *data)
-
 cdef extern from "cugraph.h":
 
-    struct gdf_edge_list:
-        gdf_column *src_indices
-        gdf_column *dest_indices
-        gdf_column *edge_data
-
-    struct gdf_adj_list:
-        gdf_column *offsets
-        gdf_column *indices
-        gdf_column *edge_data
-
-    struct gdf_graph:
-        gdf_edge_list *edgeList
-        gdf_adj_list *adjList
-        gdf_adj_list *transposedAdjList
-        
-    cdef gdf_error gdf_edge_list_view(gdf_graph *graph, 
-                             const gdf_column *source_indices,
-                             const gdf_column *destination_indices,
-                             const gdf_column *edge_data)
-    
-    cdef gdf_error gdf_adj_list_view (gdf_graph *graph, 
-                             const gdf_column *offsets,
-                             const gdf_column *indices,
-                             const gdf_column *edge_data)
-
-    cdef gdf_error gdf_add_transpose(gdf_graph *graph)
-
     cdef gdf_error gdf_bfs(gdf_graph *graph, gdf_column *distances, gdf_column *predecessors, int start_node, bool directed)
diff --git a/python/bfs/test_bfs.py b/python/bfs/test_bfs.py
new file mode 100644
index 00000000000..356a5d505b5
--- /dev/null
+++ b/python/bfs/test_bfs.py
@@ -0,0 +1,69 @@
+import cugraph
+import cudf
+import time
+from scipy.io import mmread
+import pytest
+import numpy as np
+
+def ReadMtxFile(mmFile):
+    print('Reading ' + str(mmFile) + '...')
+    return mmread(mmFile).asfptype()
+
+
+def cugraph_Call(M, start_vertex):
+
+    # Device data
+    M = M.tocsr()
+    sources = cudf.Series(M.indptr)
+    destinations = cudf.Series(M.indices)
+    values = cudf.Series(M.data)
+    
+    G = cugraph.Graph()
+    G.add_adj_list(sources, destinations, values)
+    
+    t1 = time.time()
+    df = cugraph.bfs(G, start_vertex)
+    t2 = time.time() - t1
+    print('Time : '+str(t2))
+
+    # Return distances as np.array()
+    return np.array(df['distance'])
+
+
+def base_Call(M, start_vertex):
+    intMax = 2147483647
+    M = M.tocsr()
+    offsets = M.indptr
+    indices = M.indices
+    num_verts = len(offsets) - 1
+    dist = np.zeros(num_verts, dtype=np.int32)
+    
+    for i in range(num_verts):
+        dist[i] = intMax
+    import queue
+    q = queue.Queue()
+    q.put(start_vertex)
+    dist[start_vertex] = 0
+    while(not q.empty()):
+        u = q.get()
+        for iCol in range(offsets[u],offsets[u + 1]):
+            v = indices[iCol]
+            if (dist[v] == intMax):
+                dist[v] = dist[u] + 1
+                q.put(v)
+    return dist
+
+datasets = ['/datasets/networks/dolphins.mtx',
+            '/datasets/networks/karate.mtx',
+            '/datasets/golden_data/graphs/dblp.mtx']
+
+@pytest.mark.parametrize('graph_file', datasets)
+def test_bfs(graph_file):
+
+    M = ReadMtxFile(graph_file)
+    base_dist = base_Call(M, 0)
+    dist = cugraph_Call(M, 0)
+    
+    assert len(base_dist) == len(dist)
+    for i in range(len(dist)):
+        assert base_dist[i] == dist[i]
\ No newline at end of file

From f8086ed0ae50b5d6b67c88631375d7a5fbc385af Mon Sep 17 00:00:00 2001
From: James Wyles <jwyles@nvidia.com>
Date: Fri, 1 Feb 2019 11:12:34 -0700
Subject: [PATCH 4/6] Removed static CUB, using CUB included by Gunrock instead

---
 external/cub/CHANGE_LOG.TXT                   |  381 ---
 external/cub/LICENSE.TXT                      |   24 -
 external/cub/README.md                        |  128 -
 external/cub/common.mk                        |  233 --
 external/cub/cub/agent/agent_histogram.cuh    |  787 ------
 .../cub/agent/agent_radix_sort_downsweep.cuh  |  772 ------
 .../cub/agent/agent_radix_sort_upsweep.cuh    |  526 ----
 external/cub/cub/agent/agent_reduce.cuh       |  385 ---
 .../cub/cub/agent/agent_reduce_by_key.cuh     |  549 -----
 external/cub/cub/agent/agent_rle.cuh          |  837 -------
 external/cub/cub/agent/agent_scan.cuh         |  471 ----
 .../cub/cub/agent/agent_segment_fixup.cuh     |  375 ---
 external/cub/cub/agent/agent_select_if.cuh    |  703 ------
 external/cub/cub/agent/agent_spmv_orig.cuh    |  670 ------
 .../cub/agent/single_pass_scan_operators.cuh  |  815 -------
 .../cub/block/block_adjacent_difference.cuh   |  596 -----
 .../cub/cub/block/block_discontinuity.cuh     | 1148 ---------
 external/cub/cub/block/block_exchange.cuh     | 1248 ----------
 external/cub/cub/block/block_histogram.cuh    |  415 ----
 external/cub/cub/block/block_load.cuh         | 1268 ----------
 external/cub/cub/block/block_radix_rank.cuh   |  697 ------
 external/cub/cub/block/block_radix_sort.cuh   |  862 -------
 .../cub/cub/block/block_raking_layout.cuh     |  152 --
 external/cub/cub/block/block_reduce.cuh       |  607 -----
 external/cub/cub/block/block_scan.cuh         | 2126 ----------------
 external/cub/cub/block/block_shuffle.cuh      |  305 ---
 external/cub/cub/block/block_store.cuh        | 1000 --------
 .../block_histogram_atomic.cuh                |   82 -
 .../specializations/block_histogram_sort.cuh  |  226 --
 .../specializations/block_reduce_raking.cuh   |  222 --
 .../block_reduce_raking_commutative_only.cuh  |  199 --
 .../block_reduce_warp_reductions.cuh          |  222 --
 .../specializations/block_scan_raking.cuh     |  666 -----
 .../specializations/block_scan_warp_scans.cuh |  392 ---
 .../block_scan_warp_scans2.cuh                |  436 ----
 .../block_scan_warp_scans3.cuh                |  418 ----
 external/cub/cub/cub.cuh                      |   95 -
 external/cub/cub/device/device_histogram.cuh  |  866 -------
 external/cub/cub/device/device_partition.cuh  |  273 ---
 external/cub/cub/device/device_radix_sort.cuh |  796 ------
 external/cub/cub/device/device_reduce.cuh     |  734 ------
 .../cub/device/device_run_length_encode.cuh   |  278 ---
 external/cub/cub/device/device_scan.cuh       |  443 ----
 .../device/device_segmented_radix_sort.cuh    |  875 -------
 .../cub/device/device_segmented_reduce.cuh    |  619 -----
 external/cub/cub/device/device_select.cuh     |  369 ---
 external/cub/cub/device/device_spmv.cuh       |  174 --
 .../device/dispatch/dispatch_histogram.cuh    | 1096 ---------
 .../device/dispatch/dispatch_radix_sort.cuh   | 1652 -------------
 .../cub/device/dispatch/dispatch_reduce.cuh   |  882 -------
 .../dispatch/dispatch_reduce_by_key.cuh       |  554 -----
 .../cub/cub/device/dispatch/dispatch_rle.cuh  |  538 -----
 .../cub/cub/device/dispatch/dispatch_scan.cuh |  563 -----
 .../device/dispatch/dispatch_select_if.cuh    |  542 -----
 .../device/dispatch/dispatch_spmv_orig.cuh    |  834 -------
 external/cub/cub/grid/grid_barrier.cuh        |  211 --
 external/cub/cub/grid/grid_even_share.cuh     |  222 --
 external/cub/cub/grid/grid_mapping.cuh        |  113 -
 external/cub/cub/grid/grid_queue.cuh          |  220 --
 external/cub/cub/host/mutex.cuh               |  171 --
 .../cub/iterator/arg_index_input_iterator.cuh |  259 --
 .../cache_modified_input_iterator.cuh         |  240 --
 .../cache_modified_output_iterator.cuh        |  254 --
 .../cub/iterator/constant_input_iterator.cuh  |  235 --
 .../cub/iterator/counting_input_iterator.cuh  |  228 --
 .../cub/iterator/discard_output_iterator.cuh  |  220 --
 .../cub/iterator/tex_obj_input_iterator.cuh   |  310 ---
 .../cub/iterator/tex_ref_input_iterator.cuh   |  374 ---
 .../cub/iterator/transform_input_iterator.cuh |  252 --
 external/cub/cub/thread/thread_load.cuh       |  438 ----
 external/cub/cub/thread/thread_operators.cuh  |  317 ---
 external/cub/cub/thread/thread_reduce.cuh     |  152 --
 external/cub/cub/thread/thread_scan.cuh       |  268 ---
 external/cub/cub/thread/thread_search.cuh     |  154 --
 external/cub/cub/thread/thread_store.cuh      |  422 ----
 external/cub/cub/util_allocator.cuh           |  708 ------
 external/cub/cub/util_arch.cuh                |  151 --
 external/cub/cub/util_debug.cuh               |  145 --
 external/cub/cub/util_device.cuh              |  347 ---
 external/cub/cub/util_macro.cuh               |  103 -
 external/cub/cub/util_namespace.cuh           |   46 -
 external/cub/cub/util_ptx.cuh                 |  729 ------
 external/cub/cub/util_type.cuh                | 1141 ---------
 .../warp/specializations/warp_reduce_shfl.cuh |  551 -----
 .../warp/specializations/warp_reduce_smem.cuh |  375 ---
 .../warp/specializations/warp_scan_shfl.cuh   |  656 -----
 .../warp/specializations/warp_scan_smem.cuh   |  397 ---
 external/cub/cub/warp/warp_reduce.cuh         |  612 -----
 external/cub/cub/warp/warp_scan.cuh           |  936 -------
 external/cub/eclipse code style profile.xml   |  155 --
 external/cub/examples/block/Makefile          |  128 -
 .../block/example_block_radix_sort.cu         |  323 ---
 .../examples/block/example_block_reduce.cu    |  290 ---
 .../cub/examples/block/example_block_scan.cu  |  334 ---
 external/cub/examples/block/reduce_by_key.cu  |   57 -
 external/cub/examples/device/Makefile         |  197 --
 .../example_device_partition_flagged.cu       |  233 --
 .../device/example_device_partition_if.cu     |  244 --
 .../device/example_device_radix_sort.cu       |  226 --
 .../examples/device/example_device_reduce.cu  |  180 --
 .../examples/device/example_device_scan.cu    |  186 --
 .../device/example_device_select_flagged.cu   |  233 --
 .../device/example_device_select_if.cu        |  242 --
 .../device/example_device_select_unique.cu    |  221 --
 ...ample_device_sort_find_non_trivial_runs.cu |  384 ---
 external/cub/experimental/Makefile            |  125 -
 .../experimental/defunct/example_coo_spmv.cu  | 1070 --------
 .../defunct/test_device_seg_reduce.cu         | 2142 -----------------
 .../experimental/histogram/histogram_cub.h    |  109 -
 .../histogram/histogram_gmem_atomics.h        |  185 --
 .../histogram/histogram_smem_atomics.h        |  195 --
 .../cub/experimental/histogram_compare.cu     |  635 -----
 external/cub/experimental/sparse_matrix.h     | 1244 ----------
 external/cub/experimental/spmv_compare.cu     |  917 -------
 external/cub/experimental/spmv_script.sh      |   30 -
 external/cub/test/Makefile                    |  453 ----
 external/cub/test/link_a.cu                   |   11 -
 external/cub/test/link_b.cu                   |   11 -
 external/cub/test/link_main.cpp               |   10 -
 external/cub/test/mersenne.h                  |  160 --
 external/cub/test/test_allocator.cu           |  459 ----
 external/cub/test/test_block_histogram.cu     |  310 ---
 external/cub/test/test_block_load_store.cu    |  549 -----
 external/cub/test/test_block_radix_sort.cu    |  717 ------
 external/cub/test/test_block_reduce.cu        |  822 -------
 external/cub/test/test_block_scan.cu          |  929 -------
 external/cub/test/test_device_histogram.cu    | 1669 -------------
 external/cub/test/test_device_radix_sort.cu   | 1275 ----------
 external/cub/test/test_device_reduce.cu       | 1339 -----------
 .../cub/test/test_device_reduce_by_key.cu     |  853 -------
 .../cub/test/test_device_run_length_encode.cu |  890 -------
 external/cub/test/test_device_scan.cu         | 1015 --------
 external/cub/test/test_device_select_if.cu    | 1039 --------
 .../cub/test/test_device_select_unique.cu     |  651 -----
 external/cub/test/test_grid_barrier.cu        |  152 --
 external/cub/test/test_iterator.cu            |  805 -------
 external/cub/test/test_util.h                 | 1600 ------------
 external/cub/test/test_warp_reduce.cu         |  840 -------
 external/cub/test/test_warp_scan.cu           |  630 -----
 external/cub/tune/Makefile                    |  192 --
 external/cub/tune/tune_device_reduce.cu       |  763 ------
 141 files changed, 73512 deletions(-)
 delete mode 100644 external/cub/CHANGE_LOG.TXT
 delete mode 100644 external/cub/LICENSE.TXT
 delete mode 100644 external/cub/README.md
 delete mode 100644 external/cub/common.mk
 delete mode 100644 external/cub/cub/agent/agent_histogram.cuh
 delete mode 100644 external/cub/cub/agent/agent_radix_sort_downsweep.cuh
 delete mode 100644 external/cub/cub/agent/agent_radix_sort_upsweep.cuh
 delete mode 100644 external/cub/cub/agent/agent_reduce.cuh
 delete mode 100644 external/cub/cub/agent/agent_reduce_by_key.cuh
 delete mode 100644 external/cub/cub/agent/agent_rle.cuh
 delete mode 100644 external/cub/cub/agent/agent_scan.cuh
 delete mode 100644 external/cub/cub/agent/agent_segment_fixup.cuh
 delete mode 100644 external/cub/cub/agent/agent_select_if.cuh
 delete mode 100644 external/cub/cub/agent/agent_spmv_orig.cuh
 delete mode 100644 external/cub/cub/agent/single_pass_scan_operators.cuh
 delete mode 100644 external/cub/cub/block/block_adjacent_difference.cuh
 delete mode 100644 external/cub/cub/block/block_discontinuity.cuh
 delete mode 100644 external/cub/cub/block/block_exchange.cuh
 delete mode 100644 external/cub/cub/block/block_histogram.cuh
 delete mode 100644 external/cub/cub/block/block_load.cuh
 delete mode 100644 external/cub/cub/block/block_radix_rank.cuh
 delete mode 100644 external/cub/cub/block/block_radix_sort.cuh
 delete mode 100644 external/cub/cub/block/block_raking_layout.cuh
 delete mode 100644 external/cub/cub/block/block_reduce.cuh
 delete mode 100644 external/cub/cub/block/block_scan.cuh
 delete mode 100644 external/cub/cub/block/block_shuffle.cuh
 delete mode 100644 external/cub/cub/block/block_store.cuh
 delete mode 100644 external/cub/cub/block/specializations/block_histogram_atomic.cuh
 delete mode 100644 external/cub/cub/block/specializations/block_histogram_sort.cuh
 delete mode 100644 external/cub/cub/block/specializations/block_reduce_raking.cuh
 delete mode 100644 external/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
 delete mode 100644 external/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
 delete mode 100644 external/cub/cub/block/specializations/block_scan_raking.cuh
 delete mode 100644 external/cub/cub/block/specializations/block_scan_warp_scans.cuh
 delete mode 100644 external/cub/cub/block/specializations/block_scan_warp_scans2.cuh
 delete mode 100644 external/cub/cub/block/specializations/block_scan_warp_scans3.cuh
 delete mode 100644 external/cub/cub/cub.cuh
 delete mode 100644 external/cub/cub/device/device_histogram.cuh
 delete mode 100644 external/cub/cub/device/device_partition.cuh
 delete mode 100644 external/cub/cub/device/device_radix_sort.cuh
 delete mode 100644 external/cub/cub/device/device_reduce.cuh
 delete mode 100644 external/cub/cub/device/device_run_length_encode.cuh
 delete mode 100644 external/cub/cub/device/device_scan.cuh
 delete mode 100644 external/cub/cub/device/device_segmented_radix_sort.cuh
 delete mode 100644 external/cub/cub/device/device_segmented_reduce.cuh
 delete mode 100644 external/cub/cub/device/device_select.cuh
 delete mode 100644 external/cub/cub/device/device_spmv.cuh
 delete mode 100644 external/cub/cub/device/dispatch/dispatch_histogram.cuh
 delete mode 100644 external/cub/cub/device/dispatch/dispatch_radix_sort.cuh
 delete mode 100644 external/cub/cub/device/dispatch/dispatch_reduce.cuh
 delete mode 100644 external/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
 delete mode 100644 external/cub/cub/device/dispatch/dispatch_rle.cuh
 delete mode 100644 external/cub/cub/device/dispatch/dispatch_scan.cuh
 delete mode 100644 external/cub/cub/device/dispatch/dispatch_select_if.cuh
 delete mode 100644 external/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
 delete mode 100644 external/cub/cub/grid/grid_barrier.cuh
 delete mode 100644 external/cub/cub/grid/grid_even_share.cuh
 delete mode 100644 external/cub/cub/grid/grid_mapping.cuh
 delete mode 100644 external/cub/cub/grid/grid_queue.cuh
 delete mode 100644 external/cub/cub/host/mutex.cuh
 delete mode 100644 external/cub/cub/iterator/arg_index_input_iterator.cuh
 delete mode 100644 external/cub/cub/iterator/cache_modified_input_iterator.cuh
 delete mode 100644 external/cub/cub/iterator/cache_modified_output_iterator.cuh
 delete mode 100644 external/cub/cub/iterator/constant_input_iterator.cuh
 delete mode 100644 external/cub/cub/iterator/counting_input_iterator.cuh
 delete mode 100644 external/cub/cub/iterator/discard_output_iterator.cuh
 delete mode 100644 external/cub/cub/iterator/tex_obj_input_iterator.cuh
 delete mode 100644 external/cub/cub/iterator/tex_ref_input_iterator.cuh
 delete mode 100644 external/cub/cub/iterator/transform_input_iterator.cuh
 delete mode 100644 external/cub/cub/thread/thread_load.cuh
 delete mode 100644 external/cub/cub/thread/thread_operators.cuh
 delete mode 100644 external/cub/cub/thread/thread_reduce.cuh
 delete mode 100644 external/cub/cub/thread/thread_scan.cuh
 delete mode 100644 external/cub/cub/thread/thread_search.cuh
 delete mode 100644 external/cub/cub/thread/thread_store.cuh
 delete mode 100644 external/cub/cub/util_allocator.cuh
 delete mode 100644 external/cub/cub/util_arch.cuh
 delete mode 100644 external/cub/cub/util_debug.cuh
 delete mode 100644 external/cub/cub/util_device.cuh
 delete mode 100644 external/cub/cub/util_macro.cuh
 delete mode 100644 external/cub/cub/util_namespace.cuh
 delete mode 100644 external/cub/cub/util_ptx.cuh
 delete mode 100644 external/cub/cub/util_type.cuh
 delete mode 100644 external/cub/cub/warp/specializations/warp_reduce_shfl.cuh
 delete mode 100644 external/cub/cub/warp/specializations/warp_reduce_smem.cuh
 delete mode 100644 external/cub/cub/warp/specializations/warp_scan_shfl.cuh
 delete mode 100644 external/cub/cub/warp/specializations/warp_scan_smem.cuh
 delete mode 100644 external/cub/cub/warp/warp_reduce.cuh
 delete mode 100644 external/cub/cub/warp/warp_scan.cuh
 delete mode 100644 external/cub/eclipse code style profile.xml
 delete mode 100644 external/cub/examples/block/Makefile
 delete mode 100644 external/cub/examples/block/example_block_radix_sort.cu
 delete mode 100644 external/cub/examples/block/example_block_reduce.cu
 delete mode 100644 external/cub/examples/block/example_block_scan.cu
 delete mode 100644 external/cub/examples/block/reduce_by_key.cu
 delete mode 100644 external/cub/examples/device/Makefile
 delete mode 100644 external/cub/examples/device/example_device_partition_flagged.cu
 delete mode 100644 external/cub/examples/device/example_device_partition_if.cu
 delete mode 100644 external/cub/examples/device/example_device_radix_sort.cu
 delete mode 100644 external/cub/examples/device/example_device_reduce.cu
 delete mode 100644 external/cub/examples/device/example_device_scan.cu
 delete mode 100644 external/cub/examples/device/example_device_select_flagged.cu
 delete mode 100644 external/cub/examples/device/example_device_select_if.cu
 delete mode 100644 external/cub/examples/device/example_device_select_unique.cu
 delete mode 100644 external/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
 delete mode 100644 external/cub/experimental/Makefile
 delete mode 100644 external/cub/experimental/defunct/example_coo_spmv.cu
 delete mode 100644 external/cub/experimental/defunct/test_device_seg_reduce.cu
 delete mode 100644 external/cub/experimental/histogram/histogram_cub.h
 delete mode 100644 external/cub/experimental/histogram/histogram_gmem_atomics.h
 delete mode 100644 external/cub/experimental/histogram/histogram_smem_atomics.h
 delete mode 100644 external/cub/experimental/histogram_compare.cu
 delete mode 100644 external/cub/experimental/sparse_matrix.h
 delete mode 100644 external/cub/experimental/spmv_compare.cu
 delete mode 100755 external/cub/experimental/spmv_script.sh
 delete mode 100644 external/cub/test/Makefile
 delete mode 100644 external/cub/test/link_a.cu
 delete mode 100644 external/cub/test/link_b.cu
 delete mode 100644 external/cub/test/link_main.cpp
 delete mode 100644 external/cub/test/mersenne.h
 delete mode 100644 external/cub/test/test_allocator.cu
 delete mode 100644 external/cub/test/test_block_histogram.cu
 delete mode 100644 external/cub/test/test_block_load_store.cu
 delete mode 100644 external/cub/test/test_block_radix_sort.cu
 delete mode 100644 external/cub/test/test_block_reduce.cu
 delete mode 100644 external/cub/test/test_block_scan.cu
 delete mode 100644 external/cub/test/test_device_histogram.cu
 delete mode 100644 external/cub/test/test_device_radix_sort.cu
 delete mode 100644 external/cub/test/test_device_reduce.cu
 delete mode 100644 external/cub/test/test_device_reduce_by_key.cu
 delete mode 100644 external/cub/test/test_device_run_length_encode.cu
 delete mode 100644 external/cub/test/test_device_scan.cu
 delete mode 100644 external/cub/test/test_device_select_if.cu
 delete mode 100644 external/cub/test/test_device_select_unique.cu
 delete mode 100644 external/cub/test/test_grid_barrier.cu
 delete mode 100644 external/cub/test/test_iterator.cu
 delete mode 100644 external/cub/test/test_util.h
 delete mode 100644 external/cub/test/test_warp_reduce.cu
 delete mode 100644 external/cub/test/test_warp_scan.cu
 delete mode 100644 external/cub/tune/Makefile
 delete mode 100644 external/cub/tune/tune_device_reduce.cu

diff --git a/external/cub/CHANGE_LOG.TXT b/external/cub/CHANGE_LOG.TXT
deleted file mode 100644
index 43860e691e7..00000000000
--- a/external/cub/CHANGE_LOG.TXT
+++ /dev/null
@@ -1,381 +0,0 @@
-1.7.4    09/20/2017
-    - Bug fixes: 
-        - Issue #114: Can't pair non-trivially-constructible values in radix sort
-        - Issue #115: WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes < 32 
-          		  
-//-----------------------------------------------------------------------------
-
-1.7.3    08/28/2017
-    - Bug fixes: 
-        - Issue #110: DeviceHistogram null-pointer exception bug for iterator inputs
-          		  
-//-----------------------------------------------------------------------------
-
-1.7.2    08/26/2017
-    - Bug fixes: 
-        - Issue #104: Device-wide reduction is now "run-to-run" deterministic for 
-          pseudo-associative reduction operators (like floating point addition)
-          		  
-//-----------------------------------------------------------------------------
-
-1.7.1    08/18/2017
-    - Updated Volta radix sorting tuning policies 
-    - Bug fixes: 
-        - Issue #104 (uint64_t warp-reduce broken for cub 1.7.0 on cuda 8 and older)
-        - Issue #103 (Can't mix Thrust 9.0 and CUB)
-        - Issue #102 (CUB pulls in windows.h which defines min/max macros that conflict with std::min/std::max)
-        - Issue #99 (Radix sorting crashes NVCC on Windows 10 for SM52)
-        - Issue #98 (cuda-memcheck: --tool initcheck failed with lineOfSight)
-        - Issue #94 (Git clone size)
-        - Issue #93 (accept iterators for segment offsets)
-        - Issue #87 (CUB uses anonymous unions which is not valid C++)
-        - Issue #44 (Check for C++ 11 should be changed that Visual Studio 2013 is also recognized as C++ 11 capable)
-          		  
-//-----------------------------------------------------------------------------
-
-1.7.0    06/07/2017
-    - Compatible with CUDA9 and SM7.x (Volta) independent thread scheduling 
-    - API change: remove cub::WarpAll() and cub::WarpAny().  These functions served to 
-      emulate __all and __any functionality for SM1.x devices, which did not have those 
-      operations.  However, the SM1.x devices are now deprecated in CUDA, and the 
-      interfaces of the these two functions are now lacking the lane-mask needed 
-      for collectives to run on Volta SMs having independent thread scheduling. 
-    - Bug fixes: 
-        - Issue #86 Incorrect results with ReduceByKey
-          		  
-//-----------------------------------------------------------------------------
-
-1.6.4    12/06/2016
-    - Updated sm_5x, sm_6x tuning policies for radix sorting (3.5B and 3.4B 
-      32b keys/s on TitanX and GTX 1080, respectively)
-    - Bug fixes: 
-        - Restore fence work-around for scan (reduce-by-key, etc.) hangs 
-          in CUDA 8.5
-        - Issue 65: DeviceSegmentedRadixSort should allow inputs to have 
-          pointer-to-const type 
-        - Mollify Clang device-side warnings
-        - Remove out-dated VC project files
-          		  
-//-----------------------------------------------------------------------------
-
-1.6.3    11/20/2016
-    - API change: BlockLoad and BlockStore are now templated by the local
-      data type, instead of the Iterator type.  This allows for output iterators
-      having \p void as their \p value_type (e.g., discard iterators).
-    - Updated GP100 tuning policies for radix sorting (6.2B 32b keys/s)
-    - Bug fixes: 
-        - Issue #74: Warpreduce executes reduction operator for out-of-bounds items
-        - Issue #72 (cub:InequalityWrapper::operator() should be non-const)
-        - Issue #71 (KeyVairPair won't work if Key has non-trivial ctor)
-		- Issue #70 1.5.3 breaks BlockScan API.  Retroactively reversioned
-		  from v1.5.3 -> v1.6 to appropriately indicate API change.
-		- Issue #69 cub::BlockStore::Store doesn't compile if OutputIteratorT::value_type != T  
-        - Issue #68 (cub::TilePrefixCallbackOp::WarpReduce doesn't permit ptx 
-          arch specialization)
-		- Improved support for Win32 platforms (warnings, alignment, etc)
-		  
-//-----------------------------------------------------------------------------
-
-1.6.2 (was 1.5.5)    10/25/2016
-    - Updated Pascal tuning policies for radix sorting
-    - Bug fixes: 
-        - Fix for arm64 compilation of caching allocator
-
-//-----------------------------------------------------------------------------
-
-1.6.1 (was 1.5.4)    10/14/2016
-    - Bug fixes: 
-        - Fix for radix sorting bug introduced by scan refactorization
-
-//-----------------------------------------------------------------------------
-
-1.6.0 (was 1.5.3)    10/11/2016
-    - API change: Device/block/warp-wide exclusive scans have been revised to now 
-      accept an "initial value" (instead of an "identity value") for seeding the 
-      computation with an arbitrary prefix.  
-    - API change: Device-wide reductions and scans can now have input sequence types that are 
-      different from output sequence types (as long as they are coercible)
-      value") for seeding the computation with an arbitrary prefix
-    - Reduce repository size (move doxygen binary to doc repository)
-    - Minor reductions in block-scan instruction count
-    - Bug fixes: 
-        - Issue #55: warning in cub/device/dispatch/dispatch_reduce_by_key.cuh 
-        - Issue #59: cub::DeviceScan::ExclusiveSum can't prefix sum of float into double
-        - Issue #58: Infinite loop in cub::CachingDeviceAllocator::NearestPowerOf
-        - Issue #47: Caching allocator needs to clean up cuda error upon successful retry 
-        - Issue #46: Very high amount of needed memory from the cub::DeviceHistogram::HistogramEven routine
-        - Issue #45: Caching Device Allocator fails with debug output enabled
-        - Fix for generic-type reduce-by-key warpscan (sm3.x and newer)
-
-//-----------------------------------------------------------------------------
-
-1.5.2    03/21/2016
-	- Improved medium-size scan performance for sm5x (Maxwell)
-    - Refactored caching allocator for device memory
-   		- Spends less time locked
-		- Failure to allocate a block from the runtime will retry once after
-		  freeing cached allocations
-		- Now respects max-bin (issue where blocks in excess of max-bin were
-		  still being retained in free cache)
-		- Uses C++11 mutex when available
-    - Bug fixes: 
-        - Fix for generic-type reduce-by-key warpscan (sm3.x and newer)
-          
-//-----------------------------------------------------------------------------
-
-1.5.1    12/28/2015
-    - Bug fixes: 
-        - Fix for incorrect DeviceRadixSort output for some small problems on 
-          Maxwell SM52 architectures
-        - Fix for macro redefinition warnings when compiling with Thrust sort
-          
-//-----------------------------------------------------------------------------
-
-1.5.0    12/14/2015
-    - New Features:
-        - Added new segmented device-wide operations for device-wide sort and 
-          reduction primitives.
-    - Bug fixes: 
-        - Fix for Git Issue 36 (Compilation error with GCC 4.8.4 nvcc 7.0.27) and
-          Forums thread (ThreadLoad generates compiler errors when loading from 
-          pointer-to-const)
-        - Fix for Git Issue 29 (DeviceRadixSort::SortKeys<bool> yields compiler 
-          errors)
-        - Fix for Git Issue 26 (CUDA error: misaligned address after 
-          cub::DeviceRadixSort::SortKeys())
-        - Fix for incorrect/crash on 0-length problems, e.g., Git Issue 25 (Floating 
-          point exception (core dumped) during cub::DeviceRadixSort::SortKeys)
-        - Fix for CUDA 7.5 issues on SM 5.2 with SHFL-based warp-scan and warp-reduction 
-          on non-primitive data types (e.g., user-defined structs)
-        - Fix for small radix sorting problems where 0 temporary bytes were 
-          required and users code was invoking malloc(0) on some systems where
-          that returns NULL.  (Impl assumed was asking for size again and was not 
-          running the sort.)
-          
-//-----------------------------------------------------------------------------
-
-1.4.1    04/13/2015
-    - Bug fixes: 
-        - Fixes for CUDA 7.0 issues with SHFL-based warp-scan and warp-reduction 
-          on non-primitive data types (e.g., user-defined structs)
-        - Fixes for minor CUDA 7.0 performance regressions in cub::DeviceScan,
-          DeviceReduceByKey
-        - Fixes to allow cub::DeviceRadixSort and cub::BlockRadixSort on bool types
-        - Remove requirement for callers to define the CUB_CDP macro 
-          when invoking CUB device-wide rountines using CUDA dynamic parallelism
-        - Fix for headers not being included in the proper order (or missing includes)
-          for some block-wide functions
-          
-//-----------------------------------------------------------------------------
-
-1.4.0    03/18/2015
-    - New Features:
-		- Support and performance tuning for new Maxwell GPU architectures
-        - Updated cub::DeviceHistogram implementation that provides the same 
-          "histogram-even" and "histogram-range" functionality as IPP/NPP.
-          Provides extremely fast and, perhaps more importantly, very 
-          uniform performance response across diverse real-world datasets, 
-          including pathological (homogeneous) sample distributions (resilience)
-        - New cub::DeviceSpmv methods for multiplying sparse matrices by 
-          dense vectors, load-balanced using a merge-based parallel decomposition.
-        - New cub::DeviceRadixSort sorting entry-points that always return
-          the sorted output into the specified buffer (as opposed to the 
-          cub::DoubleBuffer in which it could end up in either buffer)
-        - New cub::DeviceRunLengthEncode::NonTrivialRuns for finding the starting 
-          offsets and lengths of all non-trivial runs (i.e., length > 1) of keys in 
-          a given sequence.  (Useful for top-down partitioning algorithms like 
-          MSD sorting of very-large keys.)
-          
-//-----------------------------------------------------------------------------
-
-1.3.2    07/28/2014
-    - Bug fixes: 
-        - Fix for cub::DeviceReduce where reductions of small problems 
-          (small enough to only dispatch a single thread block) would run in 
-          the default stream (stream zero) regardless of whether an alternate
-          stream was specified.  
-          
-//-----------------------------------------------------------------------------
-
-1.3.1    05/23/2014
-    - Bug fixes: 
-        - Workaround for a benign WAW race warning reported by cuda-memcheck
-          in BlockScan specialized for BLOCK_SCAN_WARP_SCANS algorithm.
-        - Fix for bug in DeviceRadixSort where the algorithm may sort more 
-          key bits than the caller specified (up to the nearest radix digit).
-        - Fix for ~3% DeviceRadixSort performance regression on Kepler and 
-          Fermi that was introduced in v1.3.0.  
-
-//-----------------------------------------------------------------------------
-
-1.3.0    05/12/2014
-    - New features:
-        - CUB's collective (block-wide, warp-wide) primitives underwent a minor 
-          interface refactoring:
-            - To provide the appropriate support for multidimensional thread blocks,
-              The interfaces for collective classes are now template-parameterized 
-              by X, Y, and Z block dimensions (with BLOCK_DIM_Y and BLOCK_DIM_Z being 
-              optional, and BLOCK_DIM_X replacing BLOCK_THREADS).  Furthermore, the 
-              constructors that accept remapped linear thread-identifiers have been 
-              removed: all primitives now assume a row-major thread-ranking for 
-              multidimensional thread blocks.  
-            - To allow the host program (compiled by the host-pass) to 
-              accurately determine the device-specific storage requirements for 
-              a given collective (compiled for each device-pass), the interfaces 
-              for collective classes are now (optionally) template-parameterized 
-              by the desired PTX compute capability. This is useful when 
-              aliasing collective storage to shared memory that has been 
-              allocated dynamically by the host at the kernel call site.   
-            - Most CUB programs having typical 1D usage should not require any 
-              changes to accomodate these updates.
-        - Added new "combination" WarpScan methods for efficiently computing 
-          both inclusive and exclusive prefix scans (and sums).
-    - Bug fixes: 
-        - Fixed bug in cub::WarpScan (which affected cub::BlockScan and 
-          cub::DeviceScan) where incorrect results (e.g., NAN) would often be 
-          returned when parameterized for floating-point types (fp32, fp64).
-        - Workaround-fix for ptxas error when compiling with with -G flag on Linux 
-          (for debug instrumentation) 
-        - Misc. workaround-fixes for certain scan scenarios (using custom 
-          scan operators) where code compiled for SM1x is run on newer 
-          GPUs of higher compute-capability: the compiler could not tell
-          which memory space was being used collective operations and was 
-          mistakenly using global ops instead of shared ops. 
-
-//-----------------------------------------------------------------------------
-
-1.2.3    04/01/2014
-    - Bug fixes: 
-        - Fixed access violation bug in DeviceReduce::ReduceByKey for non-primitive value types
-        - Fixed code-snippet bug in ArgIndexInputIteratorT documentation 
-
-//-----------------------------------------------------------------------------
-
-1.2.2    03/03/2014
-    - New features:
-        - Added MS VC++ project solutions for device-wide and block-wide examples 
-    - Performance:
-        - Added a third algorithmic variant of cub::BlockReduce for improved performance
-          when using commutative operators (e.g., numeric addition)
-    - Bug fixes: 
-        - Fixed bug where inclusion of Thrust headers in a certain order prevented CUB device-wide primitives from working properly
-
-//-----------------------------------------------------------------------------
-
-1.2.0    02/25/2014
-    - New features:
-        - Added device-wide reduce-by-key (DeviceReduce::ReduceByKey, DeviceReduce::RunLengthEncode) 
-    - Performance
-        - Improved DeviceScan, DeviceSelect, DevicePartition performance
-    - Documentation and testing:
-        - Compatible with CUDA 6.0
-        - Added performance-portability plots for many device-wide primitives to doc 
-        - Update doc and tests to reflect iterator (in)compatibilities with CUDA 5.0 (and older) and Thrust 1.6 (and older).
-    - Bug fixes 
-        - Revised the operation of temporary tile status bookkeeping for DeviceScan (and similar) to be safe for current code run on future platforms (now uses proper fences)  
-        - Fixed DeviceScan bug where Win32 alignment disagreements between host and device regarding user-defined data types would corrupt tile status
-        - Fixed BlockScan bug where certain exclusive scans on custom data types for the BLOCK_SCAN_WARP_SCANS variant would return incorrect results for the first thread in the block
-        - Added workaround for TexRefInputIteratorTto work with CUDA 6.0
-    
-//-----------------------------------------------------------------------------
-
-1.1.1    12/11/2013
-    - New features:
-        - Added TexObjInputIteratorT, TexRefInputIteratorT, CacheModifiedInputIteratorT, and CacheModifiedOutputIterator types for loading & storing arbitrary types through the cache hierarchy.  Compatible with Thrust API. 
-        - Added descending sorting to DeviceRadixSort and BlockRadixSort
-        - Added min, max, arg-min, and arg-max to DeviceReduce
-        - Added DeviceSelect (select-unique, select-if, and select-flagged)
-        - Added DevicePartition (partition-if, partition-flagged)
-        - Added generic cub::ShuffleUp(), cub::ShuffleDown(), and cub::ShuffleIndex() for warp-wide communication of arbitrary data types (SM3x+)
-        - Added cub::MaxSmOccupancy() for accurately determining SM occupancy for any given kernel function pointer
-    - Performance
-        - Improved DeviceScan and DeviceRadixSort performance for older architectures (SM10-SM30)
-    - Interface changes:
-        - Refactored block-wide I/O (BlockLoad and BlockStore), removing cache-modifiers from their interfaces.  The CacheModifiedInputIteratorTand CacheModifiedOutputIterator should now be used with BlockLoad and BlockStore to effect that behavior.
-        - Rename device-wide "stream_synchronous" param to "debug_synchronous" to avoid confusion about usage
-    - Documentation and testing:
-        - Added simple examples of device-wide methods
-        - Improved doxygen documentation and example snippets
-        - Improved test coverege to include up to 21,000 kernel variants and 851,000 unit tests (per architecture, per platform)
-    - Bug fixes 
-        - Fixed misc DeviceScan, BlockScan, DeviceReduce, and BlockReduce bugs when operating on non-primitive types for older architectures SM10-SM13
-        - Fixed DeviceScan / WarpReduction bug: SHFL-based segmented reduction producting incorrect results for multi-word types (size > 4B) on Linux 
-        - Fixed BlockScan bug: For warpscan-based scans, not all threads in the first warp were entering the prefix callback functor
-        - Fixed DeviceRadixSort bug: race condition with key-value pairs for pre-SM35 architectures
-        - Fixed DeviceRadixSort bug: incorrect bitfield-extract behavior with long keys on 64bit Linux
-        - Fixed BlockDiscontinuity bug: complation error in for types other than int32/uint32
-        - CDP (device-callable) versions of device-wide methods now report the same temporary storage allocation size requirement as their host-callable counterparts
-     
-
-//-----------------------------------------------------------------------------
-
-1.0.2    08/23/2013
-    - Corrections to code snippet examples for BlockLoad, BlockStore, and BlockDiscontinuity
-    - Cleaned up unnecessary/missing header includes.  You can now safely #inlude a specific .cuh (instead of cub.cuh)
-    - Bug/compilation fixes for BlockHistogram 
-
-//-----------------------------------------------------------------------------
-
-1.0.1    08/08/2013
-    - New collective interface idiom (specialize::construct::invoke).
-    - Added best-in-class DeviceRadixSort.  Implements short-circuiting for homogenous digit passes.
-    - Added best-in-class DeviceScan.  Implements single-pass "adaptive-lookback" strategy.
-    - Significantly improved documentation (with example code snippets) 
-    - More extensive regression test suit for aggressively testing collective variants
-    - Allow non-trially-constructed types (previously unions had prevented aliasing temporary storage of those types)
-    - Improved support for Kepler SHFL (collective ops now use SHFL for types larger than 32b)
-    - Better code generation for 64-bit addressing within BlockLoad/BlockStore
-    - DeviceHistogram now supports histograms of arbitrary bins
-    - Misc. fixes
-      - Workarounds for SM10 codegen issues in uncommonly-used WarpScan/Reduce specializations
-      - Updates to accommodate CUDA 5.5 dynamic parallelism   
-
-
-//-----------------------------------------------------------------------------
-
-0.9.4    05/07/2013
-
-    - Fixed compilation errors for SM10-SM13
-    - Fixed compilation errors for some WarpScan entrypoints on SM30+
-    - Added block-wide histogram (BlockHistogram256)
-    - Added device-wide histogram (DeviceHistogram256)
-    - Added new BlockScan algorithm variant BLOCK_SCAN_RAKING_MEMOIZE, which 
-      trades more register consumption for less shared memory I/O)
-    - Updates to BlockRadixRank to use BlockScan (which improves performance
-      on Kepler due to SHFL instruction)
-    - Allow types other than C++ primitives to be used in WarpScan::*Sum methods 
-      if they only have operator + overloaded.  (Previously they also required 
-      to support assignment from int(0).) 
-    - Update BlockReduce's BLOCK_REDUCE_WARP_REDUCTIONS algorithm to work even 
-      when block size is not an even multiple of warp size
-    - Added work management utility descriptors (GridQueue, GridEvenShare)
-    - Refactoring of DeviceAllocator interface and CachingDeviceAllocator 
-      implementation 
-    - Misc. documentation updates and corrections. 
-     
-//-----------------------------------------------------------------------------
-
-0.9.2    04/04/2013
-
-    - Added WarpReduce.  WarpReduce uses the SHFL instruction when applicable. 
-      BlockReduce now uses this WarpReduce instead of implementing its own.
-    - Misc. fixes for 64-bit Linux compilation warnings and errors.
-    - Misc. documentation updates and corrections. 
-
-//-----------------------------------------------------------------------------
-
-0.9.1    03/09/2013
-
-    - Fix for ambiguity in BlockScan::Reduce() between generic reduction and 
-      summation.  Summation entrypoints are now called ::Sum(), similar to the 
-      convention in BlockScan.
-    - Small edits to mainpage documentation and download tracking
-    
-//-----------------------------------------------------------------------------
-
-0.9.0    03/07/2013    
-
-    - Intial "preview" release.    CUB is the first durable, high-performance library 
-      of cooperative block-level, warp-level, and thread-level primitives for CUDA 
-      kernel programming.  More primitives and examples coming soon!
-    
\ No newline at end of file
diff --git a/external/cub/LICENSE.TXT b/external/cub/LICENSE.TXT
deleted file mode 100644
index db122453f9c..00000000000
--- a/external/cub/LICENSE.TXT
+++ /dev/null
@@ -1,24 +0,0 @@
-Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
-Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-   *  Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-   *  Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-   *  Neither the name of the NVIDIA CORPORATION nor the
-      names of its contributors may be used to endorse or promote products
-      derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/external/cub/README.md b/external/cub/README.md
deleted file mode 100644
index c107d673d59..00000000000
--- a/external/cub/README.md
+++ /dev/null
@@ -1,128 +0,0 @@
-<hr>
-<h3>About CUB</h3>
-
-Current release: v1.7.4 (09/20/2017)
-
-We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.
-
-CUB provides state-of-the-art, reusable software components for every layer 
-of the CUDA programming model:
-- [<b><em>Device-wide primitives</em></b>] (https://nvlabs.github.com/cub/group___device_module.html) 
-  - Sort, prefix scan, reduction, histogram, etc.  
-  - Compatible with CUDA dynamic parallelism
-- [<b><em>Block-wide "collective" primitives</em></b>] (https://nvlabs.github.com/cub/group___block_module.html)
-  - I/O, sort, prefix scan, reduction, histogram, etc.  
-  - Compatible with arbitrary thread block sizes and types 
-- [<b><em>Warp-wide "collective" primitives</em></b>] (https://nvlabs.github.com/cub/group___warp_module.html)
-  - Warp-wide prefix scan, reduction, etc.
-  - Safe and architecture-specific
-- [<b><em>Thread and resource utilities</em></b>](https://nvlabs.github.com/cub/group___thread_module.html)
-  - PTX intrinsics, device reflection, texture-caching iterators, caching memory allocators, etc. 
-
-![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.com/cub/cub_overview.png)
-
-<br><hr>
-<h3>A Simple Example</h3>
-
-```C++
-#include <cub/cub.cuh>
- 
-// Block-sorting CUDA kernel
-__global__ void BlockSortKernel(int *d_in, int *d_out)
-{
-     using namespace cub;
-
-     // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads 
-     // owning 16 integer items each
-     typedef BlockRadixSort<int, 128, 16>                     BlockRadixSort;
-     typedef BlockLoad<int, 128, 16, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     typedef BlockStore<int, 128, 16, BLOCK_STORE_TRANSPOSE> BlockStore;
- 
-     // Allocate shared memory
-     __shared__ union {
-         typename BlockRadixSort::TempStorage  sort;
-         typename BlockLoad::TempStorage       load; 
-         typename BlockStore::TempStorage      store; 
-     } temp_storage; 
-
-     int block_offset = blockIdx.x * (128 * 16);	  // OffsetT for this block's ment
-
-     // Obtain a segment of 2048 consecutive keys that are blocked across threads
-     int thread_keys[16];
-     BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys);
-     __syncthreads();
-
-     // Collectively sort the keys
-     BlockRadixSort(temp_storage.sort).Sort(thread_keys);
-     __syncthreads();
-
-     // Store the sorted segment 
-     BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys);
-}
-```
-
-Each thread block uses cub::BlockRadixSort to collectively sort 
-its own input segment.  The class is specialized by the 
-data type being sorted, by the number of threads per block, by the number of 
-keys per thread, and implicitly by the targeted compilation architecture.  
-
-The cub::BlockLoad and cub::BlockStore classes are similarly specialized.    
-Furthermore, to provide coalesced accesses to device memory, these primitives are 
-configured to access memory using a striped access pattern (where consecutive threads 
-simultaneously access consecutive items) and then <em>transpose</em> the keys into 
-a [<em>blocked arrangement</em>](index.html#sec4sec3) of elements across threads. 
-
-Once specialized, these classes expose opaque \p TempStorage member types.  
-The thread block uses these storage types to statically allocate the union of 
-shared memory needed by the thread block.  (Alternatively these storage types 
-could be aliased to global memory allocations).
-
-<br><hr>
-<h3>Stable Releases</h3>
-
-CUB releases are labeled using version identifiers having three fields: 
-*epoch.feature.update*.  The *epoch* field corresponds to support for
-a major change in the CUDA programming model.  The *feature* field 
-corresponds to a stable set of features, functionality, and interface.  The
-*update* field corresponds to a bug-fix or performance update for that
-feature set.  At the moment, we do not publicly provide non-stable releases 
-such as development snapshots, beta releases or rolling releases.  (Feel free
-to contact us if you would like such things.)  See the 
-[CUB Project Website](http://nvlabs.github.com/cub) for more information.
-
-<br><hr>
-<h3>Contributors</h3>
-
-CUB is developed as an open-source project by [NVIDIA Research](http://research.nvidia.com).  The primary contributor is [Duane Merrill](http://github.com/dumerrill).
-
-<br><hr>
-<h3>Open Source License</h3>
-
-CUB is available under the "New BSD" open-source license:
-
-```
-Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
-Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-   *  Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-   *  Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-   *  Neither the name of the NVIDIA CORPORATION nor the
-      names of its contributors may be used to endorse or promote products
-      derived from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-```
diff --git a/external/cub/common.mk b/external/cub/common.mk
deleted file mode 100644
index 71d9880c5f5..00000000000
--- a/external/cub/common.mk
+++ /dev/null
@@ -1,233 +0,0 @@
-#/******************************************************************************
-# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
-# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
-# * 
-# * Redistribution and use in source and binary forms, with or without
-# * modification, are permitted provided that the following conditions are met:
-# *	 * Redistributions of source code must retain the above copyright
-# *	   notice, this list of conditions and the following disclaimer.
-# *	 * Redistributions in binary form must reproduce the above copyright
-# *	   notice, this list of conditions and the following disclaimer in the
-# *	   documentation and/or other materials provided with the distribution.
-# *	 * Neither the name of the NVIDIA CORPORATION nor the
-# *	   names of its contributors may be used to endorse or promote products
-# *	   derived from this software without specific prior written permission.
-# * 
-# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-# *
-#******************************************************************************/
-
-
-#-------------------------------------------------------------------------------
-# Commandline Options
-#-------------------------------------------------------------------------------
-
-# [sm=<XXX,...>] Compute-capability to compile for, e.g., "sm=200,300,350" (SM20 by default).
-  
-COMMA = ,
-ifdef sm
-	SM_ARCH = $(subst $(COMMA),-,$(sm))
-else 
-    SM_ARCH = 200
-endif
-
-ifeq (700, $(findstring 700, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_70,code=\"sm_70,compute_70\" 
-    SM_DEF 		+= -DSM700
-    TEST_ARCH 	= 700
-endif
-ifeq (620, $(findstring 620, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_62,code=\"sm_62,compute_62\" 
-    SM_DEF 		+= -DSM620
-    TEST_ARCH 	= 620
-endif
-ifeq (610, $(findstring 610, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_61,code=\"sm_61,compute_61\" 
-    SM_DEF 		+= -DSM610
-    TEST_ARCH 	= 610
-endif
-ifeq (600, $(findstring 600, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_60,code=\"sm_60,compute_60\" 
-    SM_DEF 		+= -DSM600
-    TEST_ARCH 	= 600
-endif
-ifeq (520, $(findstring 520, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_52,code=\"sm_52,compute_52\" 
-    SM_DEF 		+= -DSM520
-    TEST_ARCH 	= 520
-endif
-ifeq (370, $(findstring 370, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_37,code=\"sm_37,compute_37\" 
-    SM_DEF 		+= -DSM370
-    TEST_ARCH 	= 370
-endif
-ifeq (350, $(findstring 350, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_35,code=\"sm_35,compute_35\" 
-    SM_DEF 		+= -DSM350
-    TEST_ARCH 	= 350
-endif
-ifeq (300, $(findstring 300, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_30,code=\"sm_30,compute_30\"
-    SM_DEF 		+= -DSM300
-    TEST_ARCH 	= 300
-endif
-ifeq (210, $(findstring 210, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_20,code=\"sm_21,compute_20\"
-    SM_DEF 		+= -DSM210
-    TEST_ARCH 	= 210
-endif
-ifeq (200, $(findstring 200, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_20,code=\"sm_20,compute_20\"
-    SM_DEF 		+= -DSM200
-    TEST_ARCH 	= 200
-endif
-ifeq (130, $(findstring 130, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_13,code=\"sm_13,compute_13\" 
-    SM_DEF 		+= -DSM130
-    TEST_ARCH 	= 130
-endif
-ifeq (120, $(findstring 120, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_12,code=\"sm_12,compute_12\" 
-    SM_DEF 		+= -DSM120
-    TEST_ARCH 	= 120
-endif
-ifeq (110, $(findstring 110, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_11,code=\"sm_11,compute_11\" 
-    SM_DEF 		+= -DSM110
-    TEST_ARCH 	= 110
-endif
-ifeq (100, $(findstring 100, $(SM_ARCH)))
-    SM_TARGETS 	+= -gencode=arch=compute_10,code=\"sm_10,compute_10\" 
-    SM_DEF 		+= -DSM100
-    TEST_ARCH 	= 100
-endif
-
-
-# [cdp=<0|1>] CDP enable option (default: no)
-ifeq ($(cdp), 1)
-	DEFINES += -DCUB_CDP
-	CDP_SUFFIX = cdp
-    NVCCFLAGS += -rdc=true -lcudadevrt
-else
-	CDP_SUFFIX = nocdp
-endif
-
-
-# [force32=<0|1>] Device addressing mode option (64-bit device pointers by default) 
-ifeq ($(force32), 1)
-	CPU_ARCH = -m32
-	CPU_ARCH_SUFFIX = i386
-else
-	CPU_ARCH = -m64
-	CPU_ARCH_SUFFIX = x86_64
-    NPPI = -lnppist
-endif
-
-
-# [abi=<0|1>] CUDA ABI option (enabled by default) 
-ifneq ($(abi), 0)
-	ABI_SUFFIX = abi
-else 
-	NVCCFLAGS += -Xptxas -abi=no
-	ABI_SUFFIX = noabi
-endif
-
-
-# [open64=<0|1>] Middle-end compiler option (nvvm by default)
-ifeq ($(open64), 1)
-	NVCCFLAGS += -open64
-	PTX_SUFFIX = open64
-else 
-	PTX_SUFFIX = nvvm
-endif
-
-
-# [verbose=<0|1>] Verbose toolchain output from nvcc option
-ifeq ($(verbose), 1)
-	NVCCFLAGS += -v
-endif
-
-
-# [keep=<0|1>] Keep intermediate compilation artifacts option
-ifeq ($(keep), 1)
-	NVCCFLAGS += -keep
-endif
-
-# [debug=<0|1>] Generate debug mode code
-ifeq ($(debug), 1)
-	NVCCFLAGS += -G
-endif
-
-
-#-------------------------------------------------------------------------------
-# Compiler and compilation platform
-#-------------------------------------------------------------------------------
-
-CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST)))
-
-NVCC = "$(shell which nvcc)"
-ifdef nvccver
-    NVCC_VERSION = $(nvccver)
-else
-    NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' |  sed 's/,.*//'))
-endif
-
-# detect OS
-OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
-
-# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases 
-NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\# 
-
-ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
-    # For MSVC
-    # Enable more warnings and treat as errors
-    NVCCFLAGS += -Xcompiler /W3 -Xcompiler /WX
-    # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
-    NVCCFLAGS += -Xcompiler /fp:strict
-    # Help the compiler/linker work with huge numbers of kernels on Windows
-	NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
-	CC = cl
-	
-	# Multithreaded runtime
-	NVCCFLAGS += -Xcompiler /MT
-	
-ifneq ($(force32), 1)
-	CUDART_CYG = "$(shell dirname $(NVCC))/../lib/Win32/cudart.lib"
-else
-	CUDART_CYG = "$(shell dirname $(NVCC))/../lib/x64/cudart.lib"
-endif
-	CUDART = "$(shell cygpath -w $(CUDART_CYG))"
-else
-    # For g++
-    # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
-    NVCCFLAGS += -Xcompiler -ffloat-store
-    CC = g++
-ifneq ($(force32), 1)
-    CUDART = "$(shell dirname $(NVCC))/../lib/libcudart_static.a"
-else
-    CUDART = "$(shell dirname $(NVCC))/../lib64/libcudart_static.a"
-endif
-endif
-
-# Suffix to append to each binary
-BIN_SUFFIX = sm$(SM_ARCH)_$(PTX_SUFFIX)_$(NVCC_VERSION)_$(ABI_SUFFIX)_$(CDP_SUFFIX)_$(CPU_ARCH_SUFFIX)
-
-
-#-------------------------------------------------------------------------------
-# Dependency Lists
-#-------------------------------------------------------------------------------
-
-rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
-
-CUB_DEPS = 	$(call rwildcard, $(CUB_DIR),*.cuh) \
-			$(CUB_DIR)common.mk
-		
diff --git a/external/cub/cub/agent/agent_histogram.cuh b/external/cub/cub/agent/agent_histogram.cuh
deleted file mode 100644
index 3b6cc4c92bc..00000000000
--- a/external/cub/cub/agent/agent_histogram.cuh
+++ /dev/null
@@ -1,787 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram .
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../util_type.cuh"
-#include "../block/block_load.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
-/**
- *
- */
-enum BlockHistogramMemoryPreference
-{
-    GMEM,
-    SMEM,
-    BLEND
-};
-
-
-/**
- * Parameterizable tuning policy type for AgentHistogram
- */
-template <
-    int                             _BLOCK_THREADS,                 ///< Threads per thread block
-    int                             _PIXELS_PER_THREAD,             ///< Pixels per thread (per tile of input)
-    BlockLoadAlgorithm              _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier               _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    bool                            _RLE_COMPRESS,                  ///< Whether to perform localized RLE to compress samples before histogramming
-    BlockHistogramMemoryPreference  _MEM_PREFERENCE,                ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
-    bool                            _WORK_STEALING>                 ///< Whether to dequeue tiles from a global work queue
-struct AgentHistogramPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,                   ///< Threads per thread block
-        PIXELS_PER_THREAD       = _PIXELS_PER_THREAD,               ///< Pixels per thread (per tile of input)
-        IS_RLE_COMPRESS         = _RLE_COMPRESS,                    ///< Whether to perform localized RLE to compress samples before histogramming
-        MEM_PREFERENCE          = _MEM_PREFERENCE,                  ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
-        IS_WORK_STEALING        = _WORK_STEALING,                   ///< Whether to dequeue tiles from a global work queue
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram .
- */
-template <
-    typename    AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
-    int         PRIVATIZED_SMEM_BINS,           ///< Number of privatized shared-memory histogram bins of any channel.  Zero indicates privatized counters to be maintained in device-accessible memory.
-    int         NUM_CHANNELS,                   ///< Number of channels interleaved in the input data.  Supports up to four channels.
-    int         NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
-    typename    SampleIteratorT,                ///< Random-access input iterator type for reading samples
-    typename    CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
-    typename    PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
-    typename    OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
-    typename    OffsetT,                        ///< Signed integer type for global offsets
-    int         PTX_ARCH = CUB_PTX_ARCH>        ///< PTX compute capability
-struct AgentHistogram
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// The sample type of the input iterator
-    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-    /// The pixel type of SampleT
-    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;
-
-    /// The quad type of SampleT
-    typedef typename CubVector<SampleT, 4>::Type QuadT;
-
-    /// Constants
-    enum
-    {
-        BLOCK_THREADS           = AgentHistogramPolicyT::BLOCK_THREADS,
-
-        PIXELS_PER_THREAD       = AgentHistogramPolicyT::PIXELS_PER_THREAD,
-        SAMPLES_PER_THREAD      = PIXELS_PER_THREAD * NUM_CHANNELS,
-        QUADS_PER_THREAD        = SAMPLES_PER_THREAD / 4,
-
-        TILE_PIXELS             = PIXELS_PER_THREAD * BLOCK_THREADS,
-        TILE_SAMPLES            = SAMPLES_PER_THREAD * BLOCK_THREADS,
-
-        IS_RLE_COMPRESS            = AgentHistogramPolicyT::IS_RLE_COMPRESS,
-
-        MEM_PREFERENCE          = (PRIVATIZED_SMEM_BINS > 0) ?
-                                        AgentHistogramPolicyT::MEM_PREFERENCE :
-                                        GMEM,
-
-        IS_WORK_STEALING           = AgentHistogramPolicyT::IS_WORK_STEALING,
-    };
-
-    /// Cache load modifier for reading input elements
-    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
-
-
-    /// Input iterator wrapper type (for applying cache modifier)
-    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
-            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
-            SampleIteratorT>::Type                                           // Directly use the supplied input iterator type
-        WrappedSampleIteratorT;
-
-    /// Pixel input iterator type (for applying cache modifier)
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
-        WrappedPixelIteratorT;
-
-    /// Qaud input iterator type (for applying cache modifier)
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
-        WrappedQuadIteratorT;
-
-    /// Parameterized BlockLoad type for samples
-    typedef BlockLoad<
-            SampleT,
-            BLOCK_THREADS,
-            SAMPLES_PER_THREAD,
-            AgentHistogramPolicyT::LOAD_ALGORITHM>
-        BlockLoadSampleT;
-
-    /// Parameterized BlockLoad type for pixels
-    typedef BlockLoad<
-            PixelT,
-            BLOCK_THREADS,
-            PIXELS_PER_THREAD,
-            AgentHistogramPolicyT::LOAD_ALGORITHM>
-        BlockLoadPixelT;
-
-    /// Parameterized BlockLoad type for quads
-    typedef BlockLoad<
-            QuadT,
-            BLOCK_THREADS,
-            QUADS_PER_THREAD,
-            AgentHistogramPolicyT::LOAD_ALGORITHM>
-        BlockLoadQuadT;
-
-    /// Shared memory type required by this thread block
-    struct _TempStorage
-    {
-        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];     // Smem needed for block-privatized smem histogram (with 1 word of padding)
-
-        int tile_idx;
-
-        // Aliasable storage layout
-        union Aliasable
-        {
-            typename BlockLoadSampleT::TempStorage sample_load;     // Smem needed for loading a tile of samples
-            typename BlockLoadPixelT::TempStorage pixel_load;       // Smem needed for loading a tile of pixels
-            typename BlockLoadQuadT::TempStorage quad_load;         // Smem needed for loading a tile of quads
-
-        } aliasable;
-    };
-
-
-    /// Temporary storage type (unionable)
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    /// Reference to temp_storage
-    _TempStorage &temp_storage;
-
-    /// Sample input iterator (with cache modifier applied, if possible)
-    WrappedSampleIteratorT d_wrapped_samples;
-
-    /// Native pointer for input samples (possibly NULL if unavailable)
-    SampleT* d_native_samples;
-
-    /// The number of output bins for each channel
-    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];
-
-    /// The number of privatized bins for each channel
-    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];
-
-    /// Reference to gmem privatized histograms for each channel
-    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-    /// Reference to final output histograms (gmem)
-    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];
-
-    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
-    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];
-
-    /// The transform operator for determining privatized counter indices from samples, one for each channel
-    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
-
-    /// Whether to prefer privatized smem counters vs privatized global counters
-    bool prefer_smem;
-
-
-    //---------------------------------------------------------------------
-    // Initialize privatized bin counters
-    //---------------------------------------------------------------------
-
-    // Initialize privatized bin counters
-    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
-    {
-        // Initialize histogram bin counts to zeros
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
-            {
-                privatized_histograms[CHANNEL][privatized_bin] = 0;
-            }
-        }
-
-        // Barrier to make sure all threads are done updating counters
-        CTA_SYNC();
-    }
-
-
-    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
-    __device__ __forceinline__ void InitSmemBinCounters()
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        InitBinCounters(privatized_histograms);
-    }
-
-
-    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
-    __device__ __forceinline__ void InitGmemBinCounters()
-    {
-        InitBinCounters(d_privatized_histograms);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Update final output histograms
-    //---------------------------------------------------------------------
-
-    // Update final output histograms from privatized histograms
-    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
-    {
-        // Barrier to make sure all threads are done updating counters
-        CTA_SYNC();
-
-        // Apply privatized bin counts to output bin counts
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            int channel_bins = num_privatized_bins[CHANNEL];
-            for (int privatized_bin = threadIdx.x; 
-                    privatized_bin < channel_bins;  
-                    privatized_bin += BLOCK_THREADS)
-            {
-                int         output_bin  = -1;
-                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
-                bool        is_valid    = count > 0;
-
-                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);
-
-                if (output_bin >= 0)
-                {
-                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
-                }
-
-            }
-        }
-    }
-
-
-    // Update final output histograms from privatized histograms.  Specialized for privatized shared-memory counters
-    __device__ __forceinline__ void StoreSmemOutput()
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        StoreOutput(privatized_histograms);
-    }
-
-
-    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
-    __device__ __forceinline__ void StoreGmemOutput()
-    {
-        StoreOutput(d_privatized_histograms);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Tile accumulation
-    //---------------------------------------------------------------------
-
-    // Accumulate pixels.  Specialized for RLE compression.
-    __device__ __forceinline__ void AccumulatePixels(
-        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool                is_valid[PIXELS_PER_THREAD],
-        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
-        Int2Type<true>      is_rle_compress)
-    {
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            // Bin pixels
-            int bins[PIXELS_PER_THREAD];
-
-            #pragma unroll
-            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-            {
-                bins[PIXEL] = -1;
-                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
-            }
-
-            CounterT accumulator = 1;
-
-            #pragma unroll
-            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
-            {
-                if (bins[PIXEL] != bins[PIXEL + 1])
-                {
-                    if (bins[PIXEL] >= 0)
-                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);
-
-                     accumulator = 0;
-                }
-                accumulator++;
-            }
-
-            // Last pixel
-            if (bins[PIXELS_PER_THREAD - 1] >= 0)
-                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
-        }
-    }
-
-
-    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
-    __device__ __forceinline__ void AccumulatePixels(
-        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool                is_valid[PIXELS_PER_THREAD],
-        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
-        Int2Type<false>     is_rle_compress)
-    {
-        #pragma unroll
-        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-        {
-            #pragma unroll
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            {
-                int bin = -1;
-                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
-                if (bin >= 0)
-                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
-            }
-        }
-    }
-
-
-    /**
-     * Accumulate pixel, specialized for smem privatized histogram
-     */
-    __device__ __forceinline__ void AccumulateSmemPixels(
-        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool                is_valid[PIXELS_PER_THREAD])
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
-    }
-
-
-    /**
-     * Accumulate pixel, specialized for gmem privatized histogram
-     */
-    __device__ __forceinline__ void AccumulateGmemPixels(
-        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool                is_valid[PIXELS_PER_THREAD])
-    {
-        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
-    }
-
-
-
-    //---------------------------------------------------------------------
-    // Tile loading
-    //---------------------------------------------------------------------
-
-    // Load full, aligned tile using pixel iterator (multi-channel)
-    template <int _NUM_ACTIVE_CHANNELS>
-    __device__ __forceinline__ void LoadFullAlignedTile(
-        OffsetT                         block_offset,
-        int                             valid_samples,
-        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
-    {
-        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
-
-        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
-        // Load using a wrapped pixel iterator
-        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
-            d_wrapped_pixels,
-            reinterpret_cast<AliasedPixels&>(samples));
-    }
-
-    // Load full, aligned tile using quad iterator (single-channel)
-    __device__ __forceinline__ void LoadFullAlignedTile(
-        OffsetT                         block_offset,
-        int                             valid_samples,
-        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<1>                     num_active_channels)
-    {
-        typedef QuadT AliasedQuads[QUADS_PER_THREAD];
-
-        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));
-
-        // Load using a wrapped quad iterator
-        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
-            d_wrapped_quads,
-            reinterpret_cast<AliasedQuads&>(samples));
-    }
-
-    // Load full, aligned tile
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<true>  is_full_tile,
-        Int2Type<true>  is_aligned)
-    {
-        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
-    }
-
-    // Load full, mis-aligned tile using sample iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<true>  is_full_tile,
-        Int2Type<false> is_aligned)
-    {
-        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
-
-        // Load using sample iterator
-        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
-            d_wrapped_samples + block_offset,
-            reinterpret_cast<AliasedSamples&>(samples));
-    }
-
-    // Load partially-full, aligned tile using the pixel iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<false> is_full_tile,
-        Int2Type<true>  is_aligned)
-    {
-        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
-
-        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
-        int valid_pixels = valid_samples / NUM_CHANNELS;
-
-        // Load using a wrapped pixel iterator
-        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
-            d_wrapped_pixels,
-            reinterpret_cast<AliasedPixels&>(samples),
-            valid_pixels);
-    }
-
-    // Load partially-full, mis-aligned tile using sample iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<false> is_full_tile,
-        Int2Type<false> is_aligned)
-    {
-        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
-
-        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
-            d_wrapped_samples + block_offset,
-            reinterpret_cast<AliasedSamples&>(samples),
-            valid_samples);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Tile processing
-    //---------------------------------------------------------------------
-
-    // Consume a tile of data samples
-    template <
-        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
-        bool IS_FULL_TILE>      // Whether the tile is full
-    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
-    {
-        SampleT     samples[PIXELS_PER_THREAD][NUM_CHANNELS];
-        bool        is_valid[PIXELS_PER_THREAD];
-
-        // Load tile
-        LoadTile(
-            block_offset,
-            valid_samples,
-            samples,
-            Int2Type<IS_FULL_TILE>(),
-            Int2Type<IS_ALIGNED>());
-
-        // Set valid flags
-        #pragma unroll
-        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
-
-        // Accumulate samples
-#if CUB_PTX_ARCH >= 120
-        if (prefer_smem)
-            AccumulateSmemPixels(samples, is_valid);
-        else
-            AccumulateGmemPixels(samples, is_valid);
-#else
-        AccumulateGmemPixels(samples, is_valid);
-#endif
-
-    }
-
-
-    // Consume row tiles.  Specialized for work-stealing from queue
-    template <bool IS_ALIGNED>
-    __device__ __forceinline__ void ConsumeTiles(
-        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                   ///< The number of rows in the region of interest
-        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
-        int                 tiles_per_row,              ///< Number of image tiles per row
-        GridQueue<int>      tile_queue,
-        Int2Type<true>      is_work_stealing)
-    {
-
-        int         num_tiles                   = num_rows * tiles_per_row;
-        int         tile_idx                    = (blockIdx.y  * gridDim.x) + blockIdx.x;
-        OffsetT     num_even_share_tiles        = gridDim.x * gridDim.y;
-
-        while (tile_idx < num_tiles)
-        {
-            int     row             = tile_idx / tiles_per_row;
-            int     col             = tile_idx - (row * tiles_per_row);
-            OffsetT row_offset      = row * row_stride_samples;
-            OffsetT col_offset      = (col * TILE_SAMPLES);
-            OffsetT tile_offset     = row_offset + col_offset;
-
-            if (col == tiles_per_row - 1)
-            {
-                // Consume a partially-full tile at the end of the row
-                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
-                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
-            } 
-            else
-            {
-                // Consume full tile
-                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
-            }
-
-            CTA_SYNC();
-
-            // Get next tile
-            if (threadIdx.x == 0)
-                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;
-
-            CTA_SYNC();
-
-            tile_idx = temp_storage.tile_idx;
-        }
-    }
-
-
-    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
-    template <bool IS_ALIGNED>
-    __device__ __forceinline__ void ConsumeTiles(
-        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                   ///< The number of rows in the region of interest
-        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
-        int                 tiles_per_row,              ///< Number of image tiles per row
-        GridQueue<int>      tile_queue,
-        Int2Type<false>     is_work_stealing)
-    {
-        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
-        {
-            OffsetT row_begin   = row * row_stride_samples;
-            OffsetT row_end     = row_begin + (num_row_pixels * NUM_CHANNELS);
-            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);
-
-            while (tile_offset < row_end)
-            {
-                OffsetT num_remaining = row_end - tile_offset;
-
-                if (num_remaining < TILE_SAMPLES)
-                {
-                    // Consume partial tile
-                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
-                    break;
-                }
-
-                // Consume full tile
-                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
-                tile_offset += gridDim.x * TILE_SAMPLES;
-            }
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Parameter extraction
-    //---------------------------------------------------------------------
-
-    // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
-    template <
-        CacheLoadModifier   _MODIFIER,
-        typename            _ValueT,
-        typename            _OffsetT>
-    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
-    {
-        return itr.ptr;
-    }
-
-    // Return a native pixel pointer (specialized for other types)
-    template <typename IteratorT>
-    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
-    {
-        return NULL;
-    }
-
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentHistogram(
-        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
-        SampleIteratorT     d_samples,                                          ///< Input data to reduce
-        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number bins per final output histogram
-        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number bins per privatized histogram
-        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
-        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
-        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
-        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
-    :
-        temp_storage(temp_storage.Alias()),
-        d_wrapped_samples(d_samples),
-        num_output_bins(num_output_bins),
-        num_privatized_bins(num_privatized_bins),
-        d_output_histograms(d_output_histograms),
-        privatized_decode_op(privatized_decode_op),
-        output_decode_op(output_decode_op),
-        d_native_samples(NativePointer(d_wrapped_samples)),
-        prefer_smem((MEM_PREFERENCE == SMEM) ?
-            true :                              // prefer smem privatized histograms
-            (MEM_PREFERENCE == GMEM) ?
-                false :                         // prefer gmem privatized histograms
-                blockIdx.x & 1)                 // prefer blended privatized histograms
-    {
-        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
-
-        // Initialize the locations of this block's privatized histograms
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
-    }
-
-
-    /**
-     * Consume image
-     */
-    __device__ __forceinline__ void ConsumeTiles(
-        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                   ///< The number of rows in the region of interest
-        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
-        int                 tiles_per_row,              ///< Number of image tiles per row
-        GridQueue<int>      tile_queue)                 ///< Queue descriptor for assigning tiles of work to thread blocks
-    {
-        // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
-        int     quad_mask           = AlignBytes<QuadT>::ALIGN_BYTES - 1;
-        int     pixel_mask          = AlignBytes<PixelT>::ALIGN_BYTES - 1;
-        size_t  row_bytes           = sizeof(SampleT) * row_stride_samples;
-
-        bool quad_aligned_rows      = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&     // Single channel
-                                        ((size_t(d_native_samples) & quad_mask) == 0) &&        // ptr is quad-aligned
-                                        ((num_rows == 1) || ((row_bytes & quad_mask) == 0));    // number of row-samples is a multiple of the alignment of the quad
-
-        bool pixel_aligned_rows     = (NUM_CHANNELS > 1) &&                                     // Multi channel
-                                        ((size_t(d_native_samples) & pixel_mask) == 0) &&       // ptr is pixel-aligned
-                                        ((row_bytes & pixel_mask) == 0);                        // number of row-samples is a multiple of the alignment of the pixel
-
-        // Whether rows are aligned and can be vectorized
-        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
-            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
-        else
-            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
-    }
-
-
-    /**
-     * Initialize privatized bin counters.  Specialized for privatized shared-memory counters
-     */
-    __device__ __forceinline__ void InitBinCounters()
-    {
-        if (prefer_smem)
-            InitSmemBinCounters();
-        else
-            InitGmemBinCounters();
-    }
-
-
-    /**
-     * Store privatized histogram to device-accessible memory.  Specialized for privatized shared-memory counters
-     */
-    __device__ __forceinline__ void StoreOutput()
-    {
-        if (prefer_smem)
-            StoreSmemOutput();
-        else
-            StoreGmemOutput();
-    }
-
-
-};
-
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/agent/agent_radix_sort_downsweep.cuh b/external/cub/cub/agent/agent_radix_sort_downsweep.cuh
deleted file mode 100644
index 0eee5f4ebf1..00000000000
--- a/external/cub/cub/agent/agent_radix_sort_downsweep.cuh
+++ /dev/null
@@ -1,772 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep .
- */
-
-
-#pragma once
-
-#include <stdint.h>
-
-#include "../thread/thread_load.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_radix_rank.cuh"
-#include "../block/block_exchange.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Radix ranking algorithm
- */
-enum RadixRankAlgorithm
-{
-    RADIX_RANK_BASIC,
-    RADIX_RANK_MEMOIZE,
-    RADIX_RANK_MATCH
-};
-
-/**
- * Parameterizable tuning policy type for AgentRadixSortDownsweep
- */
-template <
-    int                         _BLOCK_THREADS,         ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,        ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,         ///< Cache load modifier for reading keys (and values)
-    RadixRankAlgorithm          _RANK_ALGORITHM,        ///< The radix ranking algorithm to use
-    BlockScanAlgorithm          _SCAN_ALGORITHM,        ///< The block scan algorithm to use
-    int                         _RADIX_BITS>            ///< The number of radix bits, i.e., log2(bins)
-struct AgentRadixSortDownsweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,           ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,        ///< Items per thread (per tile of input)
-        RADIX_BITS              = _RADIX_BITS,              ///< The number of radix bits, i.e., log2(bins)
-    };
-
-    static const BlockLoadAlgorithm  LOAD_ALGORITHM     = _LOAD_ALGORITHM;    ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier   LOAD_MODIFIER      = _LOAD_MODIFIER;     ///< Cache load modifier for reading keys (and values)
-    static const RadixRankAlgorithm  RANK_ALGORITHM     = _RANK_ALGORITHM;    ///< The radix ranking algorithm to use
-    static const BlockScanAlgorithm  SCAN_ALGORITHM     = _SCAN_ALGORITHM;    ///< The BlockScan algorithm to use
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-
-
-
-
-/**
- * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep .
- */
-template <
-    typename AgentRadixSortDownsweepPolicy,     ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
-    bool     IS_DESCENDING,                     ///< Whether or not the sorted-order is high-to-low
-    typename KeyT,                              ///< KeyT type
-    typename ValueT,                            ///< ValueT type
-    typename OffsetT>                           ///< Signed integer type for global offsets
-struct AgentRadixSortDownsweep
-{
-    //---------------------------------------------------------------------
-    // Type definitions and constants
-    //---------------------------------------------------------------------
-
-    // Appropriate unsigned-bits representation of KeyT
-    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
-
-    static const UnsignedBits           LOWEST_KEY  = Traits<KeyT>::LOWEST_KEY;
-    static const UnsignedBits           MAX_KEY     = Traits<KeyT>::MAX_KEY;
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM  = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM;
-    static const CacheLoadModifier      LOAD_MODIFIER   = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER;
-    static const RadixRankAlgorithm     RANK_ALGORITHM  = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM;
-    static const BlockScanAlgorithm     SCAN_ALGORITHM  = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM;
-
-    enum
-    {
-        BLOCK_THREADS           = AgentRadixSortDownsweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
-        RADIX_BITS              = AgentRadixSortDownsweepPolicy::RADIX_BITS,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        RADIX_DIGITS            = 1 << RADIX_BITS,
-        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,
-    };
-
-    // Input iterator wrapper type (for applying cache modifier)s
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT>    KeysItr;
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>          ValuesItr;
-
-    // Radix ranking type to use
-    typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC),
-            BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, false, SCAN_ALGORITHM>,
-            typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
-                BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, true, SCAN_ALGORITHM>,
-                BlockRadixRankMatch<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>
-            >::Type
-        >::Type BlockRadixRankT;
-
-    enum
-    {
-        /// Number of bin-starting offsets tracked per thread
-        BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD
-    };
-
-    // BlockLoad type (keys)
-    typedef BlockLoad<
-        UnsignedBits,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        LOAD_ALGORITHM> BlockLoadKeysT;
-
-    // BlockLoad type (values)
-    typedef BlockLoad<
-        ValueT,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        LOAD_ALGORITHM> BlockLoadValuesT;
-
-    // Value exchange array type
-    typedef ValueT ValueExchangeT[TILE_ITEMS];
-
-    /**
-     * Shared memory storage layout
-     */
-    union __align__(16) _TempStorage
-    {
-        typename BlockLoadKeysT::TempStorage    load_keys;
-        typename BlockLoadValuesT::TempStorage  load_values;
-        typename BlockRadixRankT::TempStorage   radix_rank;
-
-        struct
-        {
-            UnsignedBits                        exchange_keys[TILE_ITEMS];
-            OffsetT                             relative_bin_offsets[RADIX_DIGITS];
-        };
-
-        Uninitialized<ValueExchangeT>           exchange_values;
-
-        OffsetT                                 exclusive_digit_prefix[RADIX_DIGITS];
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-    // Shared storage for this CTA
-    _TempStorage    &temp_storage;
-
-    // Input and output device pointers
-    KeysItr         d_keys_in;
-    ValuesItr       d_values_in;
-    UnsignedBits    *d_keys_out;
-    ValueT          *d_values_out;
-
-    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
-    OffsetT         bin_offset[BINS_TRACKED_PER_THREAD];
-
-    // The least-significant bit position of the current digit to extract
-    int             current_bit;
-
-    // Number of bits in current digit
-    int             num_bits;
-
-    // Whether to short-cirucit
-    int             short_circuit;
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-
-    /**
-     * Scatter ranked keys through shared memory, then to device-accessible memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterKeys(
-        UnsignedBits    (&twiddled_keys)[ITEMS_PER_THREAD],
-        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        OffsetT         valid_items)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            UnsignedBits key            = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)];
-            UnsignedBits digit          = BFE(key, current_bit, num_bits);
-            relative_bin_offsets[ITEM]  = temp_storage.relative_bin_offsets[digit];
-
-            // Un-twiddle
-            key = Traits<KeyT>::TwiddleOut(key);
-
-            if (FULL_TILE || 
-                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
-            {
-                d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key;
-            }
-        }
-    }
-
-
-    /**
-     * Scatter ranked values through shared memory, then to device-accessible memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterValues(
-        ValueT      (&values)[ITEMS_PER_THREAD],
-        OffsetT     (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int         (&ranks)[ITEMS_PER_THREAD],
-        OffsetT     valid_items)
-    {
-        CTA_SYNC();
-
-        ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            exchange_values[ranks[ITEM]] = values[ITEM];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];
-
-            if (FULL_TILE || 
-                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
-            {
-                d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
-            }
-        }
-    }
-
-    /**
-     * Load a tile of keys (specialized for full tile, any ranking algorithm)
-     */
-    template <int _RANK_ALGORITHM>
-    __device__ __forceinline__ void LoadKeys(
-        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        UnsignedBits                oob_item,
-        Int2Type<true>              is_full_tile,
-        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
-    {
-        BlockLoadKeysT(temp_storage.load_keys).Load(
-            d_keys_in + block_offset, keys);
-
-        CTA_SYNC();
-    }
-
-
-    /**
-     * Load a tile of keys (specialized for partial tile, any ranking algorithm)
-     */
-    template <int _RANK_ALGORITHM>
-    __device__ __forceinline__ void LoadKeys(
-        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        UnsignedBits                oob_item,
-        Int2Type<false>             is_full_tile,
-        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
-    {
-        BlockLoadKeysT(temp_storage.load_keys).Load(
-            d_keys_in + block_offset, keys, valid_items, oob_item);
-
-        CTA_SYNC();
-    }
-
-
-    /**
-     * Load a tile of keys (specialized for full tile, match ranking algorithm)
-     */
-    __device__ __forceinline__ void LoadKeys(
-        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        UnsignedBits                oob_item,
-        Int2Type<true>              is_full_tile,
-        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
-    {
-        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys);
-    }
-
-
-    /**
-     * Load a tile of keys (specialized for partial tile, match ranking algorithm)
-     */
-    __device__ __forceinline__ void LoadKeys(
-        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        UnsignedBits                oob_item,
-        Int2Type<false>             is_full_tile,
-        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
-    {
-        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item);
-    }
-
-
-    /**
-     * Load a tile of values (specialized for full tile, any ranking algorithm)
-     */
-    template <int _RANK_ALGORITHM>
-    __device__ __forceinline__ void LoadValues(
-        ValueT                      (&values)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        Int2Type<true>              is_full_tile,
-        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
-    {
-        BlockLoadValuesT(temp_storage.load_values).Load(
-            d_values_in + block_offset, values);
-
-        CTA_SYNC();
-    }
-
-
-    /**
-     * Load a tile of values (specialized for partial tile, any ranking algorithm)
-     */
-    template <int _RANK_ALGORITHM>
-    __device__ __forceinline__ void LoadValues(
-        ValueT                      (&values)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        Int2Type<false>             is_full_tile,
-        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
-    {
-        BlockLoadValuesT(temp_storage.load_values).Load(
-            d_values_in + block_offset, values, valid_items);
-
-        CTA_SYNC();
-    }
-
-
-    /**
-     * Load a tile of items (specialized for full tile, match ranking algorithm)
-     */
-    __device__ __forceinline__ void LoadValues(
-        ValueT                      (&values)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        volatile OffsetT                     valid_items,
-        Int2Type<true>              is_full_tile,
-        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
-    {
-        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values);
-    }
-
-
-    /**
-     * Load a tile of items (specialized for partial tile, match ranking algorithm)
-     */
-    __device__ __forceinline__ void LoadValues(
-        ValueT                      (&values)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        volatile OffsetT                     valid_items,
-        Int2Type<false>             is_full_tile,
-        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
-    {
-        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items);
-    }
-
-
-    /**
-     * Truck along associated values
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void GatherScatterValues(
-        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        OffsetT         block_offset,
-        OffsetT         valid_items,
-        Int2Type<false> /*is_keys_only*/)
-    {
-        CTA_SYNC();
-
-        ValueT values[ITEMS_PER_THREAD];
-
-        LoadValues(
-            values,
-            block_offset,
-            valid_items,
-            Int2Type<FULL_TILE>(),
-            Int2Type<RANK_ALGORITHM>());
-
-        ScatterValues<FULL_TILE>(
-            values,
-            relative_bin_offsets,
-            ranks,
-            valid_items);
-    }
-
-
-    /**
-     * Truck along associated values (specialized for key-only sorting)
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void GatherScatterValues(
-        OffsetT         (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD],
-        int             (&/*ranks*/)[ITEMS_PER_THREAD],
-        OffsetT         /*block_offset*/,
-        OffsetT         /*valid_items*/,
-        Int2Type<true>  /*is_keys_only*/)
-    {}
-
-
-    /**
-     * Process tile
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ProcessTile(
-        OffsetT block_offset,
-        const OffsetT &valid_items = TILE_ITEMS)
-    {
-        UnsignedBits    keys[ITEMS_PER_THREAD];
-        int             ranks[ITEMS_PER_THREAD];
-        OffsetT         relative_bin_offsets[ITEMS_PER_THREAD];
-
-        // Assign default (min/max) value to all keys
-        UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY;
-
-        // Load tile of keys
-        LoadKeys(
-            keys,
-            block_offset,
-            valid_items, 
-            default_key,
-            Int2Type<FULL_TILE>(),
-            Int2Type<RANK_ALGORITHM>());
-
-        // Twiddle key bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            keys[KEY] = Traits<KeyT>::TwiddleIn(keys[KEY]);
-        }
-
-        // Rank the twiddled keys
-        int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
-        BlockRadixRankT(temp_storage.radix_rank).RankKeys(
-            keys,
-            ranks,
-            current_bit,
-            num_bits,
-            exclusive_digit_prefix);
-
-        CTA_SYNC();
-
-        // Share exclusive digit prefix
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                // Store exclusive prefix
-                temp_storage.exclusive_digit_prefix[bin_idx] =
-                    exclusive_digit_prefix[track];
-            }
-        }
-
-        CTA_SYNC();
-
-        // Get inclusive digit prefix
-        int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
-
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                if (IS_DESCENDING)
-                {
-                    // Get inclusive digit prefix from exclusive prefix (higher bins come first)
-                    inclusive_digit_prefix[track] = (bin_idx == 0) ?
-                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
-                        temp_storage.exclusive_digit_prefix[bin_idx - 1];
-                }
-                else
-                {
-                    // Get inclusive digit prefix from exclusive prefix (lower bins come first)
-                    inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ?
-                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
-                        temp_storage.exclusive_digit_prefix[bin_idx + 1];
-                }
-            }
-        }
-
-        CTA_SYNC();
-
-        // Update global scatter base offsets for each digit
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                bin_offset[track] -= exclusive_digit_prefix[track];
-                temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track];
-                bin_offset[track] += inclusive_digit_prefix[track];
-            }
-        }
-
-        CTA_SYNC();
-
-        // Scatter keys
-        ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);
-
-        // Gather/scatter values
-        GatherScatterValues<FULL_TILE>(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type<KEYS_ONLY>());
-    }
-
-    //---------------------------------------------------------------------
-    // Copy shortcut
-    //---------------------------------------------------------------------
-
-    /**
-     * Copy tiles within the range of input
-     */
-    template <
-        typename InputIteratorT,
-        typename T>
-    __device__ __forceinline__ void Copy(
-        InputIteratorT  d_in,
-        T               *d_out,
-        OffsetT         block_offset,
-        OffsetT         block_end)
-    {
-        // Simply copy the input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            T items[ITEMS_PER_THREAD];
-
-            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
-            CTA_SYNC();
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
-
-            block_offset += TILE_ITEMS;
-        }
-
-        // Clean up last partial tile with guarded-I/O
-        if (block_offset < block_end)
-        {
-            OffsetT valid_items = block_end - block_offset;
-
-            T items[ITEMS_PER_THREAD];
-
-            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
-            CTA_SYNC();
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
-        }
-    }
-
-
-    /**
-     * Copy tiles within the range of input (specialized for NullType)
-     */
-    template <typename InputIteratorT>
-    __device__ __forceinline__ void Copy(
-        InputIteratorT  /*d_in*/,
-        NullType        * /*d_out*/,
-        OffsetT         /*block_offset*/,
-        OffsetT         /*block_end*/)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentRadixSortDownsweep(
-        TempStorage     &temp_storage,
-        OffsetT         (&bin_offset)[BINS_TRACKED_PER_THREAD],
-        OffsetT         num_items,
-        const KeyT      *d_keys_in,
-        KeyT            *d_keys_out,
-        const ValueT    *d_values_in,
-        ValueT          *d_values_out,
-        int             current_bit,
-        int             num_bits)
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
-        d_values_in(d_values_in),
-        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
-        d_values_out(d_values_out),
-        current_bit(current_bit),
-        num_bits(num_bits),
-        short_circuit(1)
-    {
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            this->bin_offset[track] = bin_offset[track];
-
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                // Short circuit if the histogram has only bin counts of only zeros or problem-size
-                short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items));
-            }
-        }
-
-        short_circuit = CTA_SYNC_AND(short_circuit);
-    }
-
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentRadixSortDownsweep(
-        TempStorage     &temp_storage,
-        OffsetT         num_items,
-        OffsetT         *d_spine,
-        const KeyT      *d_keys_in,
-        KeyT            *d_keys_out,
-        const ValueT    *d_values_in,
-        ValueT          *d_values_out,
-        int             current_bit,
-        int             num_bits)
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
-        d_values_in(d_values_in),
-        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
-        d_values_out(d_values_out),
-        current_bit(current_bit),
-        num_bits(num_bits),
-        short_circuit(1)
-    {
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                if (IS_DESCENDING)
-                    bin_idx = RADIX_DIGITS - bin_idx - 1;
-
-                // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size
-                OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx];
-                short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
-
-                // Load my block's bin offset for my bin
-                bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
-            }
-        }
-
-        short_circuit = CTA_SYNC_AND(short_circuit);
-    }
-
-
-    /**
-     * Distribute keys from a segment of input tiles.
-     */
-    __device__ __forceinline__ void ProcessRegion(
-        OffsetT   block_offset,
-        OffsetT   block_end)
-    {
-        if (short_circuit)
-        {
-            // Copy keys
-            Copy(d_keys_in, d_keys_out, block_offset, block_end);
-
-            // Copy values
-            Copy(d_values_in, d_values_out, block_offset, block_end);
-        }
-        else
-        {
-            // Process full tiles of tile_items
-            while (block_offset + TILE_ITEMS <= block_end)
-            {
-                ProcessTile<true>(block_offset);
-                block_offset += TILE_ITEMS;
-
-                CTA_SYNC();
-            }
-
-            // Clean up last partial tile with guarded-I/O
-            if (block_offset < block_end)
-            {
-                ProcessTile<false>(block_offset, block_end - block_offset);
-            }
-
-        }
-    }
-
-};
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/agent/agent_radix_sort_upsweep.cuh b/external/cub/cub/agent/agent_radix_sort_upsweep.cuh
deleted file mode 100644
index 803fadf2486..00000000000
--- a/external/cub/cub/agent/agent_radix_sort_upsweep.cuh
+++ /dev/null
@@ -1,526 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
- */
-
-#pragma once
-
-#include "../thread/thread_reduce.cuh"
-#include "../thread/thread_load.cuh"
-#include "../warp/warp_reduce.cuh"
-#include "../block/block_load.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentRadixSortUpsweep
- */
-template <
-    int                 _BLOCK_THREADS,     ///< Threads per thread block
-    int                 _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
-    CacheLoadModifier   _LOAD_MODIFIER,     ///< Cache load modifier for reading keys
-    int                 _RADIX_BITS>        ///< The number of radix bits, i.e., log2(bins)
-struct AgentRadixSortUpsweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
-        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
-        RADIX_BITS          = _RADIX_BITS,          ///< The number of radix bits, i.e., log2(bins)
-    };
-
-    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;      ///< Cache load modifier for reading keys
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
- */
-template <
-    typename AgentRadixSortUpsweepPolicy,   ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
-    typename KeyT,                          ///< KeyT type
-    typename OffsetT>                       ///< Signed integer type for global offsets
-struct AgentRadixSortUpsweep
-{
-
-    //---------------------------------------------------------------------
-    // Type definitions and constants
-    //---------------------------------------------------------------------
-
-    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
-
-    // Integer type for digit counters (to be packed into words of PackedCounters)
-    typedef unsigned char DigitCounter;
-
-    // Integer type for packing DigitCounters into columns of shared memory banks
-    typedef unsigned int PackedCounter;
-
-    static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER;
-
-    enum
-    {
-        RADIX_BITS              = AgentRadixSortUpsweepPolicy::RADIX_BITS,
-        BLOCK_THREADS           = AgentRadixSortUpsweepPolicy::BLOCK_THREADS,
-        KEYS_PER_THREAD         = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
-
-        RADIX_DIGITS            = 1 << RADIX_BITS,
-
-        LOG_WARP_THREADS        = CUB_PTX_LOG_WARP_THREADS,
-        WARP_THREADS            = 1 << LOG_WARP_THREADS,
-        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        TILE_ITEMS              = BLOCK_THREADS * KEYS_PER_THREAD,
-
-        BYTES_PER_COUNTER       = sizeof(DigitCounter),
-        LOG_BYTES_PER_COUNTER   = Log2<BYTES_PER_COUNTER>::VALUE,
-
-        PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),
-        LOG_PACKING_RATIO       = Log2<PACKING_RATIO>::VALUE,
-
-        LOG_COUNTER_LANES       = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
-        COUNTER_LANES           = 1 << LOG_COUNTER_LANES,
-
-        // To prevent counter overflow, we must periodically unpack and aggregate the
-        // digit counters back into registers.  Each counter lane is assigned to a
-        // warp for aggregation.
-
-        LANES_PER_WARP          = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
-
-        // Unroll tiles in batches without risk of counter overflow
-        UNROLL_COUNT            = CUB_MIN(64, 255 / KEYS_PER_THREAD),
-        UNROLLED_ELEMENTS       = UNROLL_COUNT * TILE_ITEMS,
-    };
-
-
-    // Input iterator wrapper type (for applying cache modifier)s
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;
-
-    /**
-     * Shared memory storage layout
-     */
-    union __align__(16) _TempStorage
-    {
-        DigitCounter    thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
-        PackedCounter   packed_thread_counters[COUNTER_LANES][BLOCK_THREADS];
-        OffsetT         block_counters[WARP_THREADS][RADIX_DIGITS];
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields (aggregate state bundle)
-    //---------------------------------------------------------------------
-
-    // Shared storage for this CTA
-    _TempStorage    &temp_storage;
-
-    // Thread-local counters for periodically aggregating composite-counter lanes
-    OffsetT         local_counts[LANES_PER_WARP][PACKING_RATIO];
-
-    // Input and output device pointers
-    KeysItr         d_keys_in;
-
-    // The least-significant bit position of the current digit to extract
-    int             current_bit;
-
-    // Number of bits in current digit
-    int             num_bits;
-
-
-
-    //---------------------------------------------------------------------
-    // Helper structure for templated iteration
-    //---------------------------------------------------------------------
-
-    // Iterate
-    template <int COUNT, int MAX>
-    struct Iterate
-    {
-        // BucketKeys
-        static __device__ __forceinline__ void BucketKeys(
-            AgentRadixSortUpsweep       &cta,
-            UnsignedBits                keys[KEYS_PER_THREAD])
-        {
-            cta.Bucket(keys[COUNT]);
-
-            // Next
-            Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
-        }
-    };
-
-    // Terminate
-    template <int MAX>
-    struct Iterate<MAX, MAX>
-    {
-        // BucketKeys
-        static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {}
-    };
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Decode a key and increment corresponding smem digit counter
-     */
-    __device__ __forceinline__ void Bucket(UnsignedBits key)
-    {
-        // Perform transform op
-        UnsignedBits converted_key = Traits<KeyT>::TwiddleIn(key);
-
-        // Extract current digit bits
-        UnsignedBits digit = BFE(converted_key, current_bit, num_bits);
-
-        // Get sub-counter offset
-        UnsignedBits sub_counter = digit & (PACKING_RATIO - 1);
-
-        // Get row offset
-        UnsignedBits row_offset = digit >> LOG_PACKING_RATIO;
-
-        // Increment counter
-        temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++;
-    }
-
-
-    /**
-     * Reset composite counters
-     */
-    __device__ __forceinline__ void ResetDigitCounters()
-    {
-        #pragma unroll
-        for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
-        {
-            temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0;
-        }
-    }
-
-
-    /**
-     * Reset the unpacked counters in each thread
-     */
-    __device__ __forceinline__ void ResetUnpackedCounters()
-    {
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            #pragma unroll
-            for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-            {
-                local_counts[LANE][UNPACKED_COUNTER] = 0;
-            }
-        }
-    }
-
-
-    /**
-     * Extracts and aggregates the digit counters for each counter lane
-     * owned by this warp
-     */
-    __device__ __forceinline__ void UnpackDigitCounts()
-    {
-        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid = LaneId();
-
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            const int counter_lane = (LANE * WARPS) + warp_id;
-            if (counter_lane < COUNTER_LANES)
-            {
-                #pragma unroll
-                for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
-                {
-                    #pragma unroll
-                    for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-                    {
-                        OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
-                        local_counts[LANE][UNPACKED_COUNTER] += counter;
-                    }
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Processes a single, full tile
-     */
-    __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset)
-    {
-        // Tile of keys
-        UnsignedBits keys[KEYS_PER_THREAD];
-
-        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
-
-        // Prevent hoisting
-        CTA_SYNC();
-
-        // Bucket tile of keys
-        Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
-    }
-
-
-    /**
-     * Processes a single load (may have some threads masked off)
-     */
-    __device__ __forceinline__ void ProcessPartialTile(
-        OffsetT block_offset,
-        const OffsetT &block_end)
-    {
-        // Process partial tile if necessary using single loads
-        block_offset += threadIdx.x;
-        while (block_offset < block_end)
-        {
-            // Load and bucket key
-            UnsignedBits key = d_keys_in[block_offset];
-            Bucket(key);
-            block_offset += BLOCK_THREADS;
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentRadixSortUpsweep(
-        TempStorage &temp_storage,
-        const KeyT  *d_keys_in,
-        int         current_bit,
-        int         num_bits)
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
-        current_bit(current_bit),
-        num_bits(num_bits)
-    {}
-
-
-    /**
-     * Compute radix digit histograms from a segment of input tiles.
-     */
-    __device__ __forceinline__ void ProcessRegion(
-        OffsetT          block_offset,
-        const OffsetT    &block_end)
-    {
-        // Reset digit counters in smem and unpacked counters in registers
-        ResetDigitCounters();
-        ResetUnpackedCounters();
-
-        // Unroll batches of full tiles
-        while (block_offset + UNROLLED_ELEMENTS <= block_end)
-        {
-            for (int i = 0; i < UNROLL_COUNT; ++i)
-            {
-                ProcessFullTile(block_offset);
-                block_offset += TILE_ITEMS;
-            }
-
-            CTA_SYNC();
-
-            // Aggregate back into local_count registers to prevent overflow
-            UnpackDigitCounts();
-
-            CTA_SYNC();
-
-            // Reset composite counters in lanes
-            ResetDigitCounters();
-        }
-
-        // Unroll single full tiles
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            ProcessFullTile(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Process partial tile if necessary
-        ProcessPartialTile(
-            block_offset,
-            block_end);
-
-        CTA_SYNC();
-
-        // Aggregate back into local_count registers
-        UnpackDigitCounts();
-    }
-
-
-    /**
-     * Extract counts (saving them to the external array)
-     */
-    template <bool IS_DESCENDING>
-    __device__ __forceinline__ void ExtractCounts(
-        OffsetT     *counters,
-        int         bin_stride = 1,
-        int         bin_offset = 0)
-    {
-        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid   = LaneId();
-
-        // Place unpacked digit counters in shared memory
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            int counter_lane = (LANE * WARPS) + warp_id;
-            if (counter_lane < COUNTER_LANES)
-            {
-                int digit_row = counter_lane << LOG_PACKING_RATIO;
-
-                #pragma unroll
-                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-                {
-                    int bin_idx = digit_row + UNPACKED_COUNTER;
-
-                    temp_storage.block_counters[warp_tid][bin_idx] =
-                        local_counts[LANE][UNPACKED_COUNTER];
-                }
-            }
-        }
-
-        CTA_SYNC();
-
-        // Rake-reduce bin_count reductions
-
-        // Whole blocks
-        #pragma unroll
-        for (int BIN_BASE   = RADIX_DIGITS % BLOCK_THREADS;
-            (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS;
-            BIN_BASE += BLOCK_THREADS)
-        {
-            int bin_idx = BIN_BASE + threadIdx.x;
-
-            OffsetT bin_count = 0;
-            #pragma unroll
-            for (int i = 0; i < WARP_THREADS; ++i)
-                bin_count += temp_storage.block_counters[i][bin_idx];
-
-            if (IS_DESCENDING)
-                bin_idx = RADIX_DIGITS - bin_idx - 1;
-
-            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
-        }
-
-        // Remainder
-        if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS))
-        {
-            int bin_idx = threadIdx.x;
-
-            OffsetT bin_count = 0;
-            #pragma unroll
-            for (int i = 0; i < WARP_THREADS; ++i)
-                bin_count += temp_storage.block_counters[i][bin_idx];
-
-            if (IS_DESCENDING)
-                bin_idx = RADIX_DIGITS - bin_idx - 1;
-
-            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
-        }
-    }
-
-
-    /**
-     * Extract counts
-     */
-    template <int BINS_TRACKED_PER_THREAD>
-    __device__ __forceinline__ void ExtractCounts(
-        OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD])  ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
-    {
-        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid   = LaneId();
-
-        // Place unpacked digit counters in shared memory
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            int counter_lane = (LANE * WARPS) + warp_id;
-            if (counter_lane < COUNTER_LANES)
-            {
-                int digit_row = counter_lane << LOG_PACKING_RATIO;
-
-                #pragma unroll
-                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-                {
-                    int bin_idx = digit_row + UNPACKED_COUNTER;
-
-                    temp_storage.block_counters[warp_tid][bin_idx] =
-                        local_counts[LANE][UNPACKED_COUNTER];
-                }
-            }
-        }
-
-        CTA_SYNC();
-
-        // Rake-reduce bin_count reductions
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                bin_count[track] = 0;
-
-                #pragma unroll
-                for (int i = 0; i < WARP_THREADS; ++i)
-                    bin_count[track] += temp_storage.block_counters[i][bin_idx];
-            }
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/agent/agent_reduce.cuh b/external/cub/cub/agent/agent_reduce.cuh
deleted file mode 100644
index 5528d8bdd64..00000000000
--- a/external/cub/cub/agent/agent_reduce.cuh
+++ /dev/null
@@ -1,385 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction .
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../block/block_load.cuh"
-#include "../block/block_reduce.cuh"
-#include "../grid/grid_mapping.cuh"
-#include "../grid/grid_even_share.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentReduce
- */
-template <
-    int                     _BLOCK_THREADS,         ///< Threads per thread block
-    int                     _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
-    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load
-    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Cooperative block-wide reduction algorithm to use
-    CacheLoadModifier       _LOAD_MODIFIER>         ///< Cache load modifier for reading input elements
-struct AgentReducePolicy
-{
-    enum
-    {
-        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
-        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
-        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
-    };
-
-    static const BlockReduceAlgorithm  BLOCK_ALGORITHM      = _BLOCK_ALGORITHM;     ///< Cooperative block-wide reduction algorithm to use
-    static const CacheLoadModifier     LOAD_MODIFIER        = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-};
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction .
- *
- * Each thread reduces only the values it loads. If \p FIRST_TILE, this
- * partial reduction is stored into \p thread_aggregate.  Otherwise it is
- * accumulated into \p thread_aggregate.
- */
-template <
-    typename AgentReducePolicy,        ///< Parameterized AgentReducePolicy tuning policy type
-    typename InputIteratorT,           ///< Random-access iterator type for input
-    typename OutputIteratorT,          ///< Random-access iterator type for output
-    typename OffsetT,                  ///< Signed integer type for global offsets
-    typename ReductionOp>              ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-struct AgentReduce
-{
-
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    /// The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    /// Vector type of InputT for data movement
-    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
-
-    /// Input iterator wrapper type (for applying cache modifier)
-    typedef typename If<IsPointer<InputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,      // Wrap the native input pointer with CacheModifiedInputIterator
-            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
-        WrappedInputIteratorT;
-
-    /// Constants
-    enum
-    {
-        BLOCK_THREADS       = AgentReducePolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = AgentReducePolicy::ITEMS_PER_THREAD,
-        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
-        ATTEMPT_VECTORIZATION   = (VECTOR_LOAD_LENGTH > 1) &&
-                                    (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
-                                    (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
-
-    };
-
-    static const CacheLoadModifier    LOAD_MODIFIER   = AgentReducePolicy::LOAD_MODIFIER;
-    static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;
-
-    /// Parameterized BlockReduce primitive
-    typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
-
-    /// Shared memory type required by this thread block
-    struct _TempStorage
-    {
-        typename BlockReduceT::TempStorage  reduce;
-    };
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&           temp_storage;       ///< Reference to temp_storage
-    InputIteratorT          d_in;               ///< Input data to reduce
-    WrappedInputIteratorT   d_wrapped_in;       ///< Wrapped input data to reduce
-    ReductionOp             reduction_op;       ///< Binary reduction operator
-
-
-    //---------------------------------------------------------------------
-    // Utility
-    //---------------------------------------------------------------------
-
-
-    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
-    template <typename Iterator>
-    static __device__ __forceinline__ bool IsAligned(
-        Iterator        d_in,
-        Int2Type<true>  /*can_vectorize*/)
-    {
-        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
-    }
-
-    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
-    template <typename Iterator>
-    static __device__ __forceinline__ bool IsAligned(
-        Iterator        /*d_in*/,
-        Int2Type<false> /*can_vectorize*/)
-    {
-        return false;
-    }
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentReduce(
-        TempStorage&            temp_storage,       ///< Reference to temp_storage
-        InputIteratorT          d_in,               ///< Input data to reduce
-        ReductionOp             reduction_op)       ///< Binary reduction operator
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_wrapped_in(d_in),
-        reduction_op(reduction_op)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Tile consumption
-    //---------------------------------------------------------------------
-
-    /**
-     * Consume a full tile of input (non-vectorized)
-     */
-    template <int IS_FIRST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        OutputT                 &thread_aggregate,
-        OffsetT                 block_offset,       ///< The offset the tile to consume
-        int                     /*valid_items*/,    ///< The number of valid items in the tile
-        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
-        Int2Type<false>         /*can_vectorize*/)  ///< Whether or not we can vectorize loads
-    {
-        OutputT items[ITEMS_PER_THREAD];
-
-        // Load items in striped fashion
-        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
-
-        // Reduce items within each thread stripe
-        thread_aggregate = (IS_FIRST_TILE) ?
-            internal::ThreadReduce(items, reduction_op) :
-            internal::ThreadReduce(items, reduction_op, thread_aggregate);
-    }
-
-
-    /**
-     * Consume a full tile of input (vectorized)
-     */
-    template <int IS_FIRST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        OutputT                 &thread_aggregate,
-        OffsetT                 block_offset,       ///< The offset the tile to consume
-        int                     /*valid_items*/,    ///< The number of valid items in the tile
-        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
-        Int2Type<true>          /*can_vectorize*/)  ///< Whether or not we can vectorize loads
-    {
-        // Alias items as an array of VectorT and load it in striped fashion
-        enum { WORDS =  ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
-
-        // Fabricate a vectorized input iterator
-        InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
-        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
-            reinterpret_cast<VectorT*>(d_in_unqualified));
-
-        // Load items as vector items
-        InputT input_items[ITEMS_PER_THREAD];
-        VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
-        #pragma unroll
-        for (int i = 0; i < WORDS; ++i)
-            vec_items[i] = d_vec_in[BLOCK_THREADS * i];
-
-        // Convert from input type to output type
-        OutputT items[ITEMS_PER_THREAD];
-        #pragma unroll
-        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-            items[i] = input_items[i];
-
-        // Reduce items within each thread stripe
-        thread_aggregate = (IS_FIRST_TILE) ?
-            internal::ThreadReduce(items, reduction_op) :
-            internal::ThreadReduce(items, reduction_op, thread_aggregate);
-    }
-
-
-    /**
-     * Consume a partial tile of input
-     */
-    template <int IS_FIRST_TILE, int CAN_VECTORIZE>
-    __device__ __forceinline__ void ConsumeTile(
-        OutputT                 &thread_aggregate,
-        OffsetT                 block_offset,       ///< The offset the tile to consume
-        int                     valid_items,        ///< The number of valid items in the tile
-        Int2Type<false>         /*is_full_tile*/,   ///< Whether or not this is a full tile
-        Int2Type<CAN_VECTORIZE> /*can_vectorize*/)  ///< Whether or not we can vectorize loads
-    {
-        // Partial tile
-        int thread_offset = threadIdx.x;
-
-        // Read first item
-        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
-        {
-            thread_aggregate = d_wrapped_in[block_offset + thread_offset];
-            thread_offset += BLOCK_THREADS;
-        }
-
-        // Continue reading items (block-striped)
-        while (thread_offset < valid_items)
-        {
-            OutputT item        = d_wrapped_in[block_offset + thread_offset];
-            thread_aggregate    = reduction_op(thread_aggregate, item);
-            thread_offset       += BLOCK_THREADS;
-        }
-    }
-
-
-    //---------------------------------------------------------------
-    // Consume a contiguous segment of tiles
-    //---------------------------------------------------------------------
-
-    /**
-     * \brief Reduce a contiguous segment of input tiles
-     */
-    template <int CAN_VECTORIZE>
-    __device__ __forceinline__ OutputT ConsumeRange(
-        GridEvenShare<OffsetT> &even_share,          ///< GridEvenShare descriptor
-        Int2Type<CAN_VECTORIZE> can_vectorize)      ///< Whether or not we can vectorize loads
-    {
-        OutputT thread_aggregate;
-
-        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
-        {
-            // First tile isn't full (not all threads have valid items)
-            int valid_items = even_share.block_end - even_share.block_offset;
-            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
-            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
-        }
-
-        // At least one full block
-        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-        even_share.block_offset += even_share.block_stride;
-
-        // Consume subsequent full tiles of input
-        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
-        {
-            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-            even_share.block_offset += even_share.block_stride;
-        }
-
-        // Consume a partially-full tile
-        if (even_share.block_offset < even_share.block_end)
-        {
-            int valid_items = even_share.block_end - even_share.block_offset;
-            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
-        }
-
-        // Compute block-wide reduction (all threads have valid items)
-        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
-    }
-
-
-    /**
-     * \brief Reduce a contiguous segment of input tiles
-     */
-    __device__ __forceinline__ OutputT ConsumeRange(
-        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
-    {
-        GridEvenShare<OffsetT> even_share;
-        even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
-
-        return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
-            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
-            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
-    }
-
-
-    /**
-     * Reduce a contiguous segment of input tiles
-     */
-    __device__ __forceinline__ OutputT ConsumeTiles(
-        GridEvenShare<OffsetT> &even_share)        ///< [in] GridEvenShare descriptor
-    {
-        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
-        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
-
-        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
-            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
-            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
-
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/agent/agent_reduce_by_key.cuh b/external/cub/cub/agent/agent_reduce_by_key.cuh
deleted file mode 100644
index a57d60ea210..00000000000
--- a/external/cub/cub/agent/agent_reduce_by_key.cuh
+++ /dev/null
@@ -1,549 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "single_pass_scan_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/constant_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentReduceByKey
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct AgentReduceByKeyPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
- */
-template <
-    typename    AgentReduceByKeyPolicyT,        ///< Parameterized AgentReduceByKeyPolicy tuning policy type
-    typename    KeysInputIteratorT,             ///< Random-access input iterator type for keys
-    typename    UniqueOutputIteratorT,          ///< Random-access output iterator type for keys
-    typename    ValuesInputIteratorT,           ///< Random-access input iterator type for values
-    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
-    typename    NumRunsOutputIteratorT,         ///< Output iterator type for recording number of items selected
-    typename    EqualityOpT,                    ///< KeyT equality operator type
-    typename    ReductionOpT,                   ///< ValueT reduction operator type
-    typename    OffsetT>                        ///< Signed integer type for global offsets
-struct AgentReduceByKey
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // The input keys type
-    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
-
-    // The output keys type
-    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
-        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type
-
-    // The input values type
-    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
-
-    // The output values type
-    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
-        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type
-
-    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
-    typedef KeyValuePair<OffsetT, ValueOutputT> OffsetValuePairT;
-
-    // Tuple type for pairing keys and values
-    typedef KeyValuePair<KeyOutputT, ValueOutputT> KeyValuePairT;
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
-
-    // Guarded inequality functor
-    template <typename _EqualityOpT>
-    struct GuardedInequalityWrapper
-    {
-        _EqualityOpT     op;             ///< Wrapped equality operator
-        int             num_remaining;  ///< Items remaining
-
-        /// Constructor
-        __host__ __device__ __forceinline__
-        GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {}
-
-        /// Boolean inequality operator, returns <tt>(a != b)</tt>
-        template <typename T>
-        __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const
-        {
-            if (idx < num_remaining)
-                return !op(a, b);   // In bounds
-
-            // Return true if first out-of-bounds item, false otherwise
-            return (idx == num_remaining);
-       }
-    };
-
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = AgentReduceByKeyPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-        TWO_PHASE_SCATTER   = (ITEMS_PER_THREAD > 1),
-
-        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
-        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueOutputT>::PRIMITIVE),
-    };
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
-    typedef typename If<IsPointer<KeysInputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, KeyInputT, OffsetT>,     // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            KeysInputIteratorT>::Type                                                                   // Directly use the supplied input iterator type
-        WrappedKeysInputIteratorT;
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
-    typedef typename If<IsPointer<ValuesInputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            ValuesInputIteratorT>::Type                                                                 // Directly use the supplied input iterator type
-        WrappedValuesInputIteratorT;
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
-    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            AggregatesOutputIteratorT>::Type                                                            // Directly use the supplied input iterator type
-        WrappedFixupInputIteratorT;
-
-    // Reduce-value-by-segment scan operator
-    typedef ReduceBySegmentOp<ReductionOpT> ReduceBySegmentOpT;
-
-    // Parameterized BlockLoad type for keys
-    typedef BlockLoad<
-            KeyOutputT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
-        BlockLoadKeysT;
-
-    // Parameterized BlockLoad type for values
-    typedef BlockLoad<
-            ValueOutputT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
-        BlockLoadValuesT;
-
-    // Parameterized BlockDiscontinuity type for keys
-    typedef BlockDiscontinuity<
-            KeyOutputT,
-            BLOCK_THREADS>
-        BlockDiscontinuityKeys;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            OffsetValuePairT,
-            BLOCK_THREADS,
-            AgentReduceByKeyPolicyT::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef TilePrefixCallbackOp<
-            OffsetValuePairT,
-            ReduceBySegmentOpT,
-            ScanTileStateT>
-        TilePrefixCallbackOpT;
-
-    // Key and value exchange types
-    typedef KeyOutputT    KeyExchangeT[TILE_ITEMS + 1];
-    typedef ValueOutputT  ValueExchangeT[TILE_ITEMS + 1];
-
-    // Shared memory type for this thread block
-    union _TempStorage
-    {
-        struct
-        {
-            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
-            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
-            typename BlockDiscontinuityKeys::TempStorage    discontinuity;  // Smem needed for discontinuity detection
-        };
-
-        // Smem needed for loading keys
-        typename BlockLoadKeysT::TempStorage load_keys;
-
-        // Smem needed for loading values
-        typename BlockLoadValuesT::TempStorage load_values;
-
-        // Smem needed for compacting key value pairs(allows non POD items in this union)
-        Uninitialized<KeyValuePairT[TILE_ITEMS + 1]> raw_exchange;
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
-    WrappedKeysInputIteratorT       d_keys_in;          ///< Input keys
-    UniqueOutputIteratorT           d_unique_out;       ///< Unique output keys
-    WrappedValuesInputIteratorT     d_values_in;        ///< Input values
-    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
-    NumRunsOutputIteratorT          d_num_runs_out;     ///< Output pointer for total number of segments identified
-    EqualityOpT                     equality_op;        ///< KeyT equality operator
-    ReductionOpT                    reduction_op;       ///< Reduction operator
-    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    AgentReduceByKey(
-        TempStorage&                temp_storage,       ///< Reference to temp_storage
-        KeysInputIteratorT          d_keys_in,          ///< Input keys
-        UniqueOutputIteratorT       d_unique_out,       ///< Unique output keys
-        ValuesInputIteratorT        d_values_in,        ///< Input values
-        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
-        NumRunsOutputIteratorT      d_num_runs_out,     ///< Output pointer for total number of segments identified
-        EqualityOpT                 equality_op,        ///< KeyT equality operator
-        ReductionOpT                reduction_op)       ///< ValueT reduction operator
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(d_keys_in),
-        d_unique_out(d_unique_out),
-        d_values_in(d_values_in),
-        d_aggregates_out(d_aggregates_out),
-        d_num_runs_out(d_num_runs_out),
-        equality_op(equality_op),
-        reduction_op(reduction_op),
-        scan_op(reduction_op)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Scatter utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Directly scatter flagged items to output offsets
-     */
-    __device__ __forceinline__ void ScatterDirect(
-        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_indices)[ITEMS_PER_THREAD])
-    {
-        // Scatter flagged keys and values
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (segment_flags[ITEM])
-            {
-                d_unique_out[segment_indices[ITEM]]     = scatter_items[ITEM].key;
-                d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value;
-            }
-        }
-    }
-
-
-    /**
-     * 2-phase scatter flagged items to output offsets
-     *
-     * The exclusive scan causes each head flag to be paired with the previous
-     * value aggregate: the scatter offsets must be decremented for value aggregates
-     */
-    __device__ __forceinline__ void ScatterTwoPhase(
-        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
-        OffsetT         num_tile_segments,
-        OffsetT         num_tile_segments_prefix)
-    {
-        CTA_SYNC();
-
-        // Compact and scatter pairs
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (segment_flags[ITEM])
-            {
-                temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM];
-            }
-        }
-
-        CTA_SYNC();
-
-        for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
-        {
-            KeyValuePairT pair                                  = temp_storage.raw_exchange.Alias()[item];
-            d_unique_out[num_tile_segments_prefix + item]       = pair.key;
-            d_aggregates_out[num_tile_segments_prefix + item]   = pair.value;
-        }
-    }
-
-
-    /**
-     * Scatter flagged items
-     */
-    __device__ __forceinline__ void Scatter(
-        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
-        OffsetT         num_tile_segments,
-        OffsetT         num_tile_segments_prefix)
-    {
-        // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
-        if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
-        {
-            ScatterTwoPhase(
-                scatter_items,
-                segment_flags,
-                segment_indices,
-                num_tile_segments,
-                num_tile_segments_prefix);
-        }
-        else
-        {
-            ScatterDirect(
-                scatter_items,
-                segment_flags,
-                segment_indices);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic chained scan)
-     */
-    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
-    __device__ __forceinline__ void ConsumeTile(
-        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        KeyOutputT          keys[ITEMS_PER_THREAD];             // Tile keys
-        KeyOutputT          prev_keys[ITEMS_PER_THREAD];        // Tile keys shuffled up
-        ValueOutputT        values[ITEMS_PER_THREAD];           // Tile values
-        OffsetT             head_flags[ITEMS_PER_THREAD];       // Segment head flags
-        OffsetT             segment_indices[ITEMS_PER_THREAD];  // Segment indices
-        OffsetValuePairT    scan_items[ITEMS_PER_THREAD];       // Zipped values and segment flags|indices
-        KeyValuePairT       scatter_items[ITEMS_PER_THREAD];    // Zipped key value pairs for scattering
-
-        // Load keys
-        if (IS_LAST_TILE)
-            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining);
-        else
-            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);
-
-        // Load tile predecessor key in first thread
-        KeyOutputT tile_predecessor;
-        if (threadIdx.x == 0)
-        {
-            tile_predecessor = (tile_idx == 0) ?
-                keys[0] :                       // First tile gets repeat of first item (thus first item will not be flagged as a head)
-                d_keys_in[tile_offset - 1];     // Subsequent tiles get last key from previous tile
-        }
-
-        CTA_SYNC();
-
-        // Load values
-        if (IS_LAST_TILE)
-            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining);
-        else
-            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values);
-
-        CTA_SYNC();
-
-        // Initialize head-flags and shuffle up the previous keys
-        if (IS_LAST_TILE)
-        {
-            // Use custom flag operator to additionally flag the first out-of-bounds item
-            GuardedInequalityWrapper<EqualityOpT> flag_op(equality_op, num_remaining);
-            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
-                head_flags, keys, prev_keys, flag_op, tile_predecessor);
-        }
-        else
-        {
-            InequalityWrapper<EqualityOpT> flag_op(equality_op);
-            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
-                head_flags, keys, prev_keys, flag_op, tile_predecessor);
-        }
-
-        // Zip values and head flags
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            scan_items[ITEM].value  = values[ITEM];
-            scan_items[ITEM].key    = head_flags[ITEM];
-        }
-
-        // Perform exclusive tile scan
-        OffsetValuePairT    block_aggregate;        // Inclusive block-wide scan aggregate
-        OffsetT             num_segments_prefix;    // Number of segments prior to this tile
-        ValueOutputT        total_aggregate;        // The tile prefix folded with block_aggregate
-        if (tile_idx == 0)
-        {
-            // Scan first tile
-            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate);
-            num_segments_prefix     = 0;
-            total_aggregate         = block_aggregate.value;
-
-            // Update tile status if there are successor tiles
-            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
-                tile_state.SetInclusive(0, block_aggregate);
-        }
-        else
-        {
-            // Scan non-first tile
-            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
-            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op);
-
-            block_aggregate         = prefix_op.GetBlockAggregate();
-            num_segments_prefix     = prefix_op.GetExclusivePrefix().key;
-            total_aggregate         = reduction_op(
-                                        prefix_op.GetExclusivePrefix().value,
-                                        block_aggregate.value);
-        }
-
-        // Rezip scatter items and segment indices
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            scatter_items[ITEM].key     = prev_keys[ITEM];
-            scatter_items[ITEM].value   = scan_items[ITEM].value;
-            segment_indices[ITEM]       = scan_items[ITEM].key;
-        }
-
-        // At this point, each flagged segment head has:
-        //  - The key for the previous segment
-        //  - The reduced value from the previous segment
-        //  - The segment index for the reduced value
-
-        // Scatter flagged keys and values
-        OffsetT num_tile_segments = block_aggregate.key;
-        Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix);
-
-        // Last thread in last tile will output final count (and last pair, if necessary)
-        if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1))
-        {
-            OffsetT num_segments = num_segments_prefix + num_tile_segments;
-
-            // If the last tile is a whole tile, output the final_value
-            if (num_remaining == TILE_ITEMS)
-            {
-                d_unique_out[num_segments]      = keys[ITEMS_PER_THREAD - 1];
-                d_aggregates_out[num_segments]  = total_aggregate;
-                num_segments++;
-            }
-
-            // Output the total number of items selected
-            *d_num_runs_out = num_segments;
-        }
-    }
-
-
-    /**
-     * Scan tiles of items as part of a dynamic chained scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_items,          ///< Total number of input items
-        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
-        int                 start_tile)         ///< The starting tile for the current grid
-    {
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
-        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
-        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
-
-        if (num_remaining > TILE_ITEMS)
-        {
-            // Not last tile
-            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
-        }
-        else if (num_remaining > 0)
-        {
-            // Last tile
-            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/agent/agent_rle.cuh b/external/cub/cub/agent/agent_rle.cuh
deleted file mode 100644
index 0ba9216176c..00000000000
--- a/external/cub/cub/agent/agent_rle.cuh
+++ /dev/null
@@ -1,837 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "single_pass_scan_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/constant_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentRle
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    bool                        _STORE_WARP_TIME_SLICING,       ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct AgentRlePolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,     ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode 
- */
-template <
-    typename    AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
-    typename    InputIteratorT,         ///< Random-access input iterator type for data
-    typename    OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values
-    typename    LengthsOutputIteratorT, ///< Random-access output iterator type for length values
-    typename    EqualityOpT,            ///< T equality operator type
-    typename    OffsetT>                ///< Signed integer type for global offsets
-struct AgentRle
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
-
-    /// The lengths output value type
-    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
-        OffsetT,                                                                                                    // ... then the OffsetT type,
-        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
-
-    /// Tuple type for scanning (pairs run-length and run-index)
-    typedef KeyValuePair<OffsetT, LengthT> LengthOffsetPair;
-
-    /// Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
-
-    // Constants
-    enum
-    {
-        WARP_THREADS            = CUB_WARP_THREADS(PTX_ARCH),
-        BLOCK_THREADS           = AgentRlePolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = AgentRlePolicyT::ITEMS_PER_THREAD,
-        WARP_ITEMS              = WARP_THREADS * ITEMS_PER_THREAD,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        /// Whether or not to sync after loading data
-        SYNC_AFTER_LOAD         = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
-
-        /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-        STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING,
-        ACTIVE_EXCHANGE_WARPS   = (STORE_WARP_TIME_SLICING) ? 1 : WARPS,
-    };
-
-
-    /**
-     * Special operator that signals all out-of-bounds items are not equal to everything else,
-     * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked
-     * trivial.
-     */
-    template <bool LAST_TILE>
-    struct OobInequalityOp
-    {
-        OffsetT         num_remaining;
-        EqualityOpT      equality_op;
-
-        __device__ __forceinline__ OobInequalityOp(
-            OffsetT     num_remaining,
-            EqualityOpT  equality_op)
-        :
-            num_remaining(num_remaining),
-            equality_op(equality_op)
-        {}
-
-        template <typename Index>
-        __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx)
-        {
-            if (!LAST_TILE || (idx < num_remaining))
-                return !equality_op(first, second);
-            else
-                return true;
-        }
-    };
-
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for data
-    typedef typename If<IsPointer<InputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentRlePolicyT::LOAD_MODIFIER, T, OffsetT>,      // Wrap the native input pointer with CacheModifiedVLengthnputIterator
-            InputIteratorT>::Type                                                       // Directly use the supplied input iterator type
-        WrappedInputIteratorT;
-
-    // Parameterized BlockLoad type for data
-    typedef BlockLoad<
-            T,
-            AgentRlePolicyT::BLOCK_THREADS,
-            AgentRlePolicyT::ITEMS_PER_THREAD,
-            AgentRlePolicyT::LOAD_ALGORITHM>
-        BlockLoadT;
-
-    // Parameterized BlockDiscontinuity type for data
-    typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
-
-    // Parameterized WarpScan type
-    typedef WarpScan<LengthOffsetPair> WarpScanPairs;
-
-    // Reduce-length-by-run scan operator
-    typedef ReduceBySegmentOp<cub::Sum> ReduceBySegmentOpT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef TilePrefixCallbackOp<
-            LengthOffsetPair,
-            ReduceBySegmentOpT,
-            ScanTileStateT>
-        TilePrefixCallbackOpT;
-
-    // Warp exchange types
-    typedef WarpExchange<LengthOffsetPair, ITEMS_PER_THREAD>        WarpExchangePairs;
-
-    typedef typename If<STORE_WARP_TIME_SLICING, typename WarpExchangePairs::TempStorage, NullType>::Type WarpExchangePairsStorage;
-
-    typedef WarpExchange<OffsetT, ITEMS_PER_THREAD>                 WarpExchangeOffsets;
-    typedef WarpExchange<LengthT, ITEMS_PER_THREAD>                 WarpExchangeLengths;
-
-    typedef LengthOffsetPair WarpAggregates[WARPS];
-
-    // Shared memory type for this thread block
-    struct _TempStorage
-    {
-        // Aliasable storage layout
-        union Aliasable
-        {
-            struct
-            {
-                typename BlockDiscontinuityT::TempStorage       discontinuity;              // Smem needed for discontinuity detection
-                typename WarpScanPairs::TempStorage             warp_scan[WARPS];           // Smem needed for warp-synchronous scans
-                Uninitialized<LengthOffsetPair[WARPS]>          warp_aggregates;            // Smem needed for sharing warp-wide aggregates
-                typename TilePrefixCallbackOpT::TempStorage     prefix;                     // Smem needed for cooperative prefix callback
-            };
-
-            // Smem needed for input loading
-            typename BlockLoadT::TempStorage                    load;
-
-            // Aliasable layout needed for two-phase scatter
-            union ScatterAliasable
-            {
-                unsigned long long                              align;
-                WarpExchangePairsStorage                        exchange_pairs[ACTIVE_EXCHANGE_WARPS];
-                typename WarpExchangeOffsets::TempStorage       exchange_offsets[ACTIVE_EXCHANGE_WARPS];
-                typename WarpExchangeLengths::TempStorage       exchange_lengths[ACTIVE_EXCHANGE_WARPS];
-
-            } scatter_aliasable;
-
-        } aliasable;
-
-        OffsetT             tile_idx;                   // Shared tile index
-        LengthOffsetPair    tile_inclusive;             // Inclusive tile prefix
-        LengthOffsetPair    tile_exclusive;             // Exclusive tile prefix
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
-
-    WrappedInputIteratorT           d_in;               ///< Pointer to input sequence of data items
-    OffsetsOutputIteratorT          d_offsets_out;      ///< Input run offsets
-    LengthsOutputIteratorT          d_lengths_out;      ///< Output run lengths
-
-    EqualityOpT                     equality_op;        ///< T equality operator
-    ReduceBySegmentOpT              scan_op;            ///< Reduce-length-by-flag scan operator
-    OffsetT                         num_items;          ///< Total number of input items
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    AgentRle(
-        TempStorage                 &temp_storage,      ///< [in] Reference to temp_storage
-        InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
-        OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run offsets
-        LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run lengths
-        EqualityOpT                 equality_op,        ///< [in] T equality operator
-        OffsetT                     num_items)          ///< [in] Total number of input items
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_offsets_out(d_offsets_out),
-        d_lengths_out(d_lengths_out),
-        equality_op(equality_op),
-        scan_op(cub::Sum()),
-        num_items(num_items)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for initializing the selections
-    //---------------------------------------------------------------------
-
-    template <bool FIRST_TILE, bool LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        OffsetT             tile_offset,
-        OffsetT             num_remaining,
-        T                   (&items)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
-    {
-        bool                head_flags[ITEMS_PER_THREAD];
-        bool                tail_flags[ITEMS_PER_THREAD];
-
-        OobInequalityOp<LAST_TILE> inequality_op(num_remaining, equality_op);
-
-        if (FIRST_TILE && LAST_TILE)
-        {
-            // First-and-last-tile always head-flags the first item and tail-flags the last item
-
-            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
-                head_flags, tail_flags, items, inequality_op);
-        }
-        else if (FIRST_TILE)
-        {
-            // First-tile always head-flags the first item
-
-            // Get the first item from the next tile
-            T tile_successor_item;
-            if (threadIdx.x == BLOCK_THREADS - 1)
-                tile_successor_item = d_in[tile_offset + TILE_ITEMS];
-
-            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
-                head_flags, tail_flags, tile_successor_item, items, inequality_op);
-        }
-        else if (LAST_TILE)
-        {
-            // Last-tile always flags the last item
-
-            // Get the last item from the previous tile
-            T tile_predecessor_item;
-            if (threadIdx.x == 0)
-                tile_predecessor_item = d_in[tile_offset - 1];
-
-            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
-                head_flags, tile_predecessor_item, tail_flags, items, inequality_op);
-        }
-        else
-        {
-            // Get the first item from the next tile
-            T tile_successor_item;
-            if (threadIdx.x == BLOCK_THREADS - 1)
-                tile_successor_item = d_in[tile_offset + TILE_ITEMS];
-
-            // Get the last item from the previous tile
-            T tile_predecessor_item;
-            if (threadIdx.x == 0)
-                tile_predecessor_item = d_in[tile_offset - 1];
-
-            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
-                head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op);
-        }
-
-        // Zip counts and runs
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            lengths_and_num_runs[ITEM].key      = head_flags[ITEM] && (!tail_flags[ITEM]);
-            lengths_and_num_runs[ITEM].value    = ((!head_flags[ITEM]) || (!tail_flags[ITEM]));
-        }
-    }
-
-    //---------------------------------------------------------------------
-    // Scan utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Scan of allocations
-     */
-    __device__ __forceinline__ void WarpScanAllocations(
-        LengthOffsetPair    &tile_aggregate,
-        LengthOffsetPair    &warp_aggregate,
-        LengthOffsetPair    &warp_exclusive_in_tile,
-        LengthOffsetPair    &thread_exclusive_in_warp,
-        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
-    {
-        // Perform warpscans
-        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        LengthOffsetPair identity;
-        identity.key = 0;
-        identity.value = 0;
-
-        LengthOffsetPair thread_inclusive;
-        LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op);
-        WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan(
-            thread_aggregate,
-            thread_inclusive,
-            thread_exclusive_in_warp,
-            identity,
-            scan_op);
-
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive;
-
-        CTA_SYNC();
-
-        // Accumulate total selected and the warp-wide prefix
-        warp_exclusive_in_tile          = identity;
-        warp_aggregate                  = temp_storage.aliasable.warp_aggregates.Alias()[warp_id];
-        tile_aggregate                  = temp_storage.aliasable.warp_aggregates.Alias()[0];
-
-        #pragma unroll
-        for (int WARP = 1; WARP < WARPS; ++WARP)
-        {
-            if (warp_id == WARP)
-                warp_exclusive_in_tile = tile_aggregate;
-
-            tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for scattering selections
-    //---------------------------------------------------------------------
-
-    /**
-     * Two-phase scatter, specialized for warp time-slicing
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        OffsetT             tile_num_runs_exclusive_in_global,
-        OffsetT             warp_num_runs_aggregate,
-        OffsetT             warp_num_runs_exclusive_in_tile,
-        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
-        Int2Type<true>      is_warp_time_slice)
-    {
-        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        // Locally compact items within the warp (first warp)
-        if (warp_id == 0)
-        {
-            WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
-                lengths_and_offsets, thread_num_runs_exclusive_in_warp);
-        }
-
-        // Locally compact items within the warp (remaining warps)
-        #pragma unroll
-        for (int SLICE = 1; SLICE < WARPS; ++SLICE)
-        {
-            CTA_SYNC();
-
-            if (warp_id == SLICE)
-            {
-                WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
-                    lengths_and_offsets, thread_num_runs_exclusive_in_warp);
-            }
-        }
-
-        // Global scatter
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id)
-            {
-                OffsetT item_offset =
-                    tile_num_runs_exclusive_in_global +
-                    warp_num_runs_exclusive_in_tile +
-                    (ITEM * WARP_THREADS) + lane_id;
-
-                // Scatter offset
-                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
-
-                // Scatter length if not the first (global) length
-                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
-                {
-                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Two-phase scatter
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        OffsetT             tile_num_runs_exclusive_in_global,
-        OffsetT             warp_num_runs_aggregate,
-        OffsetT             warp_num_runs_exclusive_in_tile,
-        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
-        Int2Type<false>     is_warp_time_slice)
-    {
-        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        // Unzip
-        OffsetT run_offsets[ITEMS_PER_THREAD];
-        LengthT run_lengths[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            run_offsets[ITEM] = lengths_and_offsets[ITEM].key;
-            run_lengths[ITEM] = lengths_and_offsets[ITEM].value;
-        }
-
-        WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped(
-            run_offsets, thread_num_runs_exclusive_in_warp);
-
-        WARP_SYNC(0xffffffff);
-
-        WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped(
-            run_lengths, thread_num_runs_exclusive_in_warp);
-
-        // Global scatter
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate)
-            {
-                OffsetT item_offset =
-                    tile_num_runs_exclusive_in_global +
-                    warp_num_runs_exclusive_in_tile +
-                    (ITEM * WARP_THREADS) + lane_id;
-
-                // Scatter offset
-                d_offsets_out[item_offset] = run_offsets[ITEM];
-
-                // Scatter length if not the first (global) length
-                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
-                {
-                    d_lengths_out[item_offset - 1] = run_lengths[ITEM];
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Direct scatter
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterDirect(
-        OffsetT             tile_num_runs_exclusive_in_global,
-        OffsetT             warp_num_runs_aggregate,
-        OffsetT             warp_num_runs_exclusive_in_tile,
-        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate)
-            {
-                OffsetT item_offset =
-                    tile_num_runs_exclusive_in_global +
-                    warp_num_runs_exclusive_in_tile +
-                    thread_num_runs_exclusive_in_warp[ITEM];
-
-                // Scatter offset
-                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
-
-                // Scatter length if not the first (global) length
-                if (item_offset >= 1)
-                {
-                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Scatter
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void Scatter(
-        OffsetT             tile_num_runs_aggregate,
-        OffsetT             tile_num_runs_exclusive_in_global,
-        OffsetT             warp_num_runs_aggregate,
-        OffsetT             warp_num_runs_exclusive_in_tile,
-        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
-    {
-        if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS))
-        {
-            // Direct scatter if the warp has any items
-            if (warp_num_runs_aggregate)
-            {
-                ScatterDirect<FIRST_TILE>(
-                    tile_num_runs_exclusive_in_global,
-                    warp_num_runs_aggregate,
-                    warp_num_runs_exclusive_in_tile,
-                    thread_num_runs_exclusive_in_warp,
-                    lengths_and_offsets);
-            }
-        }
-        else
-        {
-            // Scatter two phase
-            ScatterTwoPhase<FIRST_TILE>(
-                tile_num_runs_exclusive_in_global,
-                warp_num_runs_aggregate,
-                warp_num_runs_exclusive_in_tile,
-                thread_num_runs_exclusive_in_warp,
-                lengths_and_offsets,
-                Int2Type<STORE_WARP_TIME_SLICING>());
-        }
-    }
-
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic chained scan)
-     */
-    template <
-        bool                LAST_TILE>
-    __device__ __forceinline__ LengthOffsetPair ConsumeTile(
-        OffsetT             num_items,          ///< Total number of global input items
-        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,       ///< Tile offset
-        ScanTileStateT       &tile_status)       ///< Global list of tile status
-    {
-        if (tile_idx == 0)
-        {
-            // First tile
-
-            // Load items
-            T items[ITEMS_PER_THREAD];
-            if (LAST_TILE)
-                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
-            else
-                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
-
-            if (SYNC_AFTER_LOAD)
-                CTA_SYNC();
-
-            // Set flags
-            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
-
-            InitializeSelections<true, LAST_TILE>(
-                tile_offset,
-                num_remaining,
-                items,
-                lengths_and_num_runs);
-
-            // Exclusive scan of lengths and runs
-            LengthOffsetPair tile_aggregate;
-            LengthOffsetPair warp_aggregate;
-            LengthOffsetPair warp_exclusive_in_tile;
-            LengthOffsetPair thread_exclusive_in_warp;
-
-            WarpScanAllocations(
-                tile_aggregate,
-                warp_aggregate,
-                warp_exclusive_in_tile,
-                thread_exclusive_in_warp,
-                lengths_and_num_runs);
-
-            // Update tile status if this is not the last tile
-            if (!LAST_TILE && (threadIdx.x == 0))
-                tile_status.SetInclusive(0, tile_aggregate);
-
-            // Update thread_exclusive_in_warp to fold in warp run-length
-            if (thread_exclusive_in_warp.key == 0)
-                thread_exclusive_in_warp.value += warp_exclusive_in_tile.value;
-
-            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
-            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
-            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
-
-            // Downsweep scan through lengths_and_num_runs
-            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
-
-            // Zip
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
-                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
-                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
-                                                                lengths_and_num_runs2[ITEM].key :         // keep
-                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
-            }
-
-            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
-            OffsetT tile_num_runs_exclusive_in_global    = 0;
-            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
-            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
-
-            // Scatter
-            Scatter<true>(
-                tile_num_runs_aggregate,
-                tile_num_runs_exclusive_in_global,
-                warp_num_runs_aggregate,
-                warp_num_runs_exclusive_in_tile,
-                thread_num_runs_exclusive_in_warp,
-                lengths_and_offsets);
-
-            // Return running total (inclusive of this tile)
-            return tile_aggregate;
-        }
-        else
-        {
-            // Not first tile
-
-            // Load items
-            T items[ITEMS_PER_THREAD];
-            if (LAST_TILE)
-                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
-            else
-                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
-
-            if (SYNC_AFTER_LOAD)
-                CTA_SYNC();
-
-            // Set flags
-            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
-
-            InitializeSelections<false, LAST_TILE>(
-                tile_offset,
-                num_remaining,
-                items,
-                lengths_and_num_runs);
-
-            // Exclusive scan of lengths and runs
-            LengthOffsetPair tile_aggregate;
-            LengthOffsetPair warp_aggregate;
-            LengthOffsetPair warp_exclusive_in_tile;
-            LengthOffsetPair thread_exclusive_in_warp;
-
-            WarpScanAllocations(
-                tile_aggregate,
-                warp_aggregate,
-                warp_exclusive_in_tile,
-                thread_exclusive_in_warp,
-                lengths_and_num_runs);
-
-            // First warp computes tile prefix in lane 0
-            TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx);
-            unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-            if (warp_id == 0)
-            {
-                prefix_op(tile_aggregate);
-                if (threadIdx.x == 0)
-                    temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
-            }
-
-            CTA_SYNC();
-
-            LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;
-
-            // Update thread_exclusive_in_warp to fold in warp and tile run-lengths
-            LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
-            if (thread_exclusive_in_warp.key == 0)
-                thread_exclusive_in_warp.value += thread_exclusive.value;
-
-            // Downsweep scan through lengths_and_num_runs
-            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
-            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
-            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
-
-            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
-
-            // Zip
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
-                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
-                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
-                                                                lengths_and_num_runs2[ITEM].key :         // keep
-                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
-            }
-
-            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
-            OffsetT tile_num_runs_exclusive_in_global    = tile_exclusive_in_global.key;
-            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
-            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
-
-            // Scatter
-            Scatter<false>(
-                tile_num_runs_aggregate,
-                tile_num_runs_exclusive_in_global,
-                warp_num_runs_aggregate,
-                warp_num_runs_exclusive_in_tile,
-                thread_num_runs_exclusive_in_warp,
-                lengths_and_offsets);
-
-            // Return running total (inclusive of this tile)
-            return prefix_op.inclusive_prefix;
-        }
-    }
-
-
-    /**
-     * Scan tiles of items as part of a dynamic chained scan
-     */
-    template <typename NumRunsIteratorT>            ///< Output iterator type for recording number of items selected
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_tiles,              ///< Total number of input tiles
-        ScanTileStateT&     tile_status,            ///< Global list of tile status
-        NumRunsIteratorT    d_num_runs_out)         ///< Output pointer for total number of runs identified
-    {
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
-        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                  // Global offset for the current tile
-        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
-
-        if (tile_idx < num_tiles - 1)
-        {
-            // Not the last tile (full)
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
-        }
-        else if (num_remaining > 0)
-        {
-            // The last tile (possibly partially-full)
-            LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
-
-            if (threadIdx.x == 0)
-            {
-                // Output the total number of items selected
-                *d_num_runs_out = running_total.key;
-
-                // The inclusive prefix contains accumulated length reduction for the last run
-                if (running_total.key > 0)
-                    d_lengths_out[running_total.key - 1] = running_total.value;
-            }
-        }
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/agent/agent_scan.cuh b/external/cub/cub/agent/agent_scan.cuh
deleted file mode 100644
index 567df8049e9..00000000000
--- a/external/cub/cub/agent/agent_scan.cuh
+++ /dev/null
@@ -1,471 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan .
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "single_pass_scan_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentScan
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    BlockStoreAlgorithm         _STORE_ALGORITHM,               ///< The BlockStore algorithm to use
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct AgentScanPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
-    static const BlockStoreAlgorithm    STORE_ALGORITHM         = _STORE_ALGORITHM;         ///< The BlockStore algorithm to use
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;          ///< The BlockScan algorithm to use
-};
-
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan .
- */
-template <
-    typename AgentScanPolicyT,      ///< Parameterized AgentScanPolicyT tuning policy type
-    typename InputIteratorT,        ///< Random-access input iterator type
-    typename OutputIteratorT,       ///< Random-access output iterator type
-    typename ScanOpT,               ///< Scan functor type
-    typename InitValueT,            ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan)
-    typename OffsetT>               ///< Signed integer type for global offsets
-struct AgentScan
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<OutputT> ScanTileStateT;
-
-    // Input iterator wrapper type (for applying cache modifier)
-    typedef typename If<IsPointer<InputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedInputIterator
-            InputIteratorT>::Type                                                           // Directly use the supplied input iterator type
-        WrappedInputIteratorT;
-
-    // Constants
-    enum
-    {
-        IS_INCLUSIVE        = Equals<InitValueT, NullType>::VALUE,            // Inclusive scan if no init_value type is provided
-        BLOCK_THREADS       = AgentScanPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = AgentScanPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-    // Parameterized BlockLoad type
-    typedef BlockLoad<
-            OutputT,
-            AgentScanPolicyT::BLOCK_THREADS,
-            AgentScanPolicyT::ITEMS_PER_THREAD,
-            AgentScanPolicyT::LOAD_ALGORITHM>
-        BlockLoadT;
-
-    // Parameterized BlockStore type
-    typedef BlockStore<
-            OutputT,
-            AgentScanPolicyT::BLOCK_THREADS,
-            AgentScanPolicyT::ITEMS_PER_THREAD,
-            AgentScanPolicyT::STORE_ALGORITHM>
-        BlockStoreT;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            OutputT,
-            AgentScanPolicyT::BLOCK_THREADS,
-            AgentScanPolicyT::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef TilePrefixCallbackOp<
-            OutputT,
-            ScanOpT,
-            ScanTileStateT>
-        TilePrefixCallbackOpT;
-
-    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
-    typedef BlockScanRunningPrefixOp<
-            OutputT,
-            ScanOpT>
-        RunningPrefixCallbackOp;
-
-    // Shared memory type for this thread block
-    union _TempStorage
-    {
-        typename BlockLoadT::TempStorage    load;       // Smem needed for tile loading
-        typename BlockStoreT::TempStorage   store;      // Smem needed for tile storing
-
-        struct
-        {
-            typename TilePrefixCallbackOpT::TempStorage  prefix;     // Smem needed for cooperative prefix callback
-            typename BlockScanT::TempStorage             scan;       // Smem needed for tile scanning
-        };
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&               temp_storage;       ///< Reference to temp_storage
-    WrappedInputIteratorT       d_in;               ///< Input data
-    OutputIteratorT             d_out;              ///< Output data
-    ScanOpT                     scan_op;            ///< Binary scan operator
-    InitValueT                  init_value;         ///< The init_value element for ScanOpT
-
-
-    //---------------------------------------------------------------------
-    // Block scan utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Exclusive scan specialization (first tile)
-     */
-    __device__ __forceinline__
-    void ScanTile(
-        OutputT             (&items)[ITEMS_PER_THREAD],
-        OutputT             init_value,
-        ScanOpT             scan_op,
-        OutputT             &block_aggregate,
-        Int2Type<false>     /*is_inclusive*/)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate);
-        block_aggregate = scan_op(init_value, block_aggregate);
-    }
-
-
-    /**
-     * Inclusive scan specialization (first tile)
-     */
-    __device__ __forceinline__
-    void ScanTile(
-        OutputT             (&items)[ITEMS_PER_THREAD],
-        InitValueT          /*init_value*/,
-        ScanOpT             scan_op,
-        OutputT             &block_aggregate,
-        Int2Type<true>      /*is_inclusive*/)
-    {
-        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
-    }
-
-
-    /**
-     * Exclusive scan specialization (subsequent tiles)
-     */
-    template <typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanTile(
-        OutputT             (&items)[ITEMS_PER_THREAD],
-        ScanOpT             scan_op,
-        PrefixCallback      &prefix_op,
-        Int2Type<false>     /*is_inclusive*/)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op);
-    }
-
-
-    /**
-     * Inclusive scan specialization (subsequent tiles)
-     */
-    template <typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanTile(
-        OutputT             (&items)[ITEMS_PER_THREAD],
-        ScanOpT             scan_op,
-        PrefixCallback      &prefix_op,
-        Int2Type<true>      /*is_inclusive*/)
-    {
-        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    AgentScan(
-        TempStorage&    temp_storage,       ///< Reference to temp_storage
-        InputIteratorT  d_in,               ///< Input data
-        OutputIteratorT d_out,              ///< Output data
-        ScanOpT         scan_op,            ///< Binary scan operator
-        InitValueT      init_value)         ///< Initial value to seed the exclusive scan
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_out(d_out),
-        scan_op(scan_op),
-        init_value(init_value)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic chained scan)
-     */
-    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
-    __device__ __forceinline__ void ConsumeTile(
-        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        // Load items
-        OutputT items[ITEMS_PER_THREAD];
-
-        if (IS_LAST_TILE)
-            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining);
-        else
-            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
-
-        CTA_SYNC();
-
-        // Perform tile scan
-        if (tile_idx == 0)
-        {
-            // Scan first tile
-            OutputT block_aggregate;
-            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
-            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
-                tile_state.SetInclusive(0, block_aggregate);
-        }
-        else
-        {
-            // Scan non-first tile
-            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
-            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
-        }
-
-        CTA_SYNC();
-
-        // Store items
-        if (IS_LAST_TILE)
-            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining);
-        else
-            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
-    }
-
-
-    /**
-     * Scan tiles of items as part of a dynamic chained scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_items,          ///< Total number of input items
-        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
-        int                 start_tile)         ///< The starting tile for the current grid
-    {
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
-        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
-        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
-
-        if (num_remaining > TILE_ITEMS)
-        {
-            // Not last tile
-            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
-        }
-        else if (num_remaining > 0)
-        {
-            // Last tile
-            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Scan an sequence of consecutive tiles (independent of other thread blocks)
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input
-     */
-    template <
-        bool                        IS_FIRST_TILE,
-        bool                        IS_LAST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        OffsetT                     tile_offset,                ///< Tile offset
-        RunningPrefixCallbackOp&    prefix_op,                  ///< Running prefix operator
-        int                         valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
-    {
-        // Load items
-        OutputT items[ITEMS_PER_THREAD];
-
-        if (IS_LAST_TILE)
-            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items);
-        else
-            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
-
-        CTA_SYNC();
-
-        // Block scan
-        if (IS_FIRST_TILE)
-        {
-            OutputT block_aggregate;
-            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
-            prefix_op.running_total = block_aggregate;
-        }
-        else
-        {
-            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
-        }
-
-        CTA_SYNC();
-
-        // Store items
-        if (IS_LAST_TILE)
-            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items);
-        else
-            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
-    }
-
-
-    /**
-     * Scan a consecutive share of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        OffsetT  range_offset,      ///< [in] Threadblock begin offset (inclusive)
-        OffsetT  range_end)         ///< [in] Threadblock end offset (exclusive)
-    {
-        BlockScanRunningPrefixOp<OutputT, ScanOpT> prefix_op(scan_op);
-
-        if (range_offset + TILE_ITEMS <= range_end)
-        {
-            // Consume first tile of input (full)
-            ConsumeTile<true, true>(range_offset, prefix_op);
-            range_offset += TILE_ITEMS;
-
-            // Consume subsequent full tiles of input
-            while (range_offset + TILE_ITEMS <= range_end)
-            {
-                ConsumeTile<false, true>(range_offset, prefix_op);
-                range_offset += TILE_ITEMS;
-            }
-
-            // Consume a partially-full tile
-            if (range_offset < range_end)
-            {
-                int valid_items = range_end - range_offset;
-                ConsumeTile<false, false>(range_offset, prefix_op, valid_items);
-            }
-        }
-        else
-        {
-            // Consume the first tile of input (partially-full)
-            int valid_items = range_end - range_offset;
-            ConsumeTile<true, false>(range_offset, prefix_op, valid_items);
-        }
-    }
-
-
-    /**
-     * Scan a consecutive share of input tiles, seeded with the specified prefix value
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        OffsetT range_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        OffsetT range_end,                          ///< [in] Threadblock end offset (exclusive)
-        OutputT prefix)                             ///< [in] The prefix to apply to the scan segment
-    {
-        BlockScanRunningPrefixOp<OutputT, ScanOpT> prefix_op(prefix, scan_op);
-
-        // Consume full tiles of input
-        while (range_offset + TILE_ITEMS <= range_end)
-        {
-            ConsumeTile<true, false>(range_offset, prefix_op);
-            range_offset += TILE_ITEMS;
-        }
-
-        // Consume a partially-full tile
-        if (range_offset < range_end)
-        {
-            int valid_items = range_end - range_offset;
-            ConsumeTile<false, false>(range_offset, prefix_op, valid_items);
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/agent/agent_segment_fixup.cuh b/external/cub/cub/agent/agent_segment_fixup.cuh
deleted file mode 100644
index efa6d8693ff..00000000000
--- a/external/cub/cub/agent/agent_segment_fixup.cuh
+++ /dev/null
@@ -1,375 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "single_pass_scan_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/constant_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentSegmentFixup
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct AgentSegmentFixupPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
- */
-template <
-    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
-    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
-    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
-    typename    EqualityOpT,                    ///< KeyT equality operator type
-    typename    ReductionOpT,                   ///< ValueT reduction operator type
-    typename    OffsetT>                        ///< Signed integer type for global offsets
-struct AgentSegmentFixup
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Data type of key-value input iterator
-    typedef typename std::iterator_traits<PairsInputIteratorT>::value_type KeyValuePairT;
-
-    // Value type
-    typedef typename KeyValuePairT::Value ValueT;
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = AgentSegmentFixupPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        // Whether or not do fixup using RLE + global atomics
-        USE_ATOMIC_FIXUP    = (CUB_PTX_ARCH >= 350) && 
-                                (Equals<ValueT, float>::VALUE || 
-                                 Equals<ValueT, int>::VALUE ||
-                                 Equals<ValueT, unsigned int>::VALUE ||
-                                 Equals<ValueT, unsigned long long>::VALUE),
-
-        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
-        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueT>::PRIMITIVE),
-    };
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
-    typedef typename If<IsPointer<PairsInputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, KeyValuePairT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            PairsInputIteratorT>::Type                                                                      // Directly use the supplied input iterator type
-        WrappedPairsInputIteratorT;
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
-    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, ValueT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            AggregatesOutputIteratorT>::Type                                                        // Directly use the supplied input iterator type
-        WrappedFixupInputIteratorT;
-
-    // Reduce-value-by-segment scan operator
-    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;
-
-    // Parameterized BlockLoad type for pairs
-    typedef BlockLoad<
-            KeyValuePairT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentSegmentFixupPolicyT::LOAD_ALGORITHM>
-        BlockLoadPairs;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            KeyValuePairT,
-            BLOCK_THREADS,
-            AgentSegmentFixupPolicyT::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef TilePrefixCallbackOp<
-            KeyValuePairT,
-            ReduceBySegmentOpT,
-            ScanTileStateT>
-        TilePrefixCallbackOpT;
-
-    // Shared memory type for this thread block
-    union _TempStorage
-    {
-        struct
-        {
-            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
-            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
-        };
-
-        // Smem needed for loading keys
-        typename BlockLoadPairs::TempStorage load_pairs;
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
-    WrappedPairsInputIteratorT      d_pairs_in;          ///< Input keys
-    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
-    WrappedFixupInputIteratorT      d_fixup_in;         ///< Fixup input values
-    InequalityWrapper<EqualityOpT>  inequality_op;      ///< KeyT inequality operator
-    ReductionOpT                    reduction_op;       ///< Reduction operator
-    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    AgentSegmentFixup(
-        TempStorage&                temp_storage,       ///< Reference to temp_storage
-        PairsInputIteratorT         d_pairs_in,          ///< Input keys
-        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
-        EqualityOpT                 equality_op,        ///< KeyT equality operator
-        ReductionOpT                reduction_op)       ///< ValueT reduction operator
-    :
-        temp_storage(temp_storage.Alias()),
-        d_pairs_in(d_pairs_in),
-        d_aggregates_out(d_aggregates_out),
-        d_fixup_in(d_aggregates_out),
-        inequality_op(equality_op),
-        reduction_op(reduction_op),
-        scan_op(reduction_op)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-
-    /**
-     * Process input tile.  Specialized for atomic-fixup
-     */
-    template <bool IS_LAST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
-        Int2Type<true>      use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
-    {
-        KeyValuePairT   pairs[ITEMS_PER_THREAD];
-
-        // Load pairs
-        KeyValuePairT oob_pair;
-        oob_pair.key = -1;
-
-        if (IS_LAST_TILE)
-            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
-        else
-            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
-
-        // RLE 
-        #pragma unroll
-        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key;
-            if (pairs[ITEM].key != pairs[ITEM - 1].key)
-                atomicAdd(d_scatter, pairs[ITEM - 1].value);
-            else
-                pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value);
-        }
-
-        // Flush last item if valid
-        ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key;
-        if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0))
-            atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value);
-    }
-
-
-    /**
-     * Process input tile.  Specialized for reduce-by-key fixup
-     */
-    template <bool IS_LAST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
-        Int2Type<false>     use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
-    {
-        KeyValuePairT   pairs[ITEMS_PER_THREAD];
-        KeyValuePairT   scatter_pairs[ITEMS_PER_THREAD];
-
-        // Load pairs
-        KeyValuePairT oob_pair;
-        oob_pair.key = -1;
-
-        if (IS_LAST_TILE)
-            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
-        else
-            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
-
-        CTA_SYNC();
-
-        KeyValuePairT tile_aggregate;
-        if (tile_idx == 0)
-        {
-            // Exclusive scan of values and segment_flags
-            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate);
-
-            // Update tile status if this is not the last tile
-            if (threadIdx.x == 0)
-            {
-                // Set first segment id to not trigger a flush (invalid from exclusive scan)
-                scatter_pairs[0].key = pairs[0].key;
-
-                if (!IS_LAST_TILE)
-                    tile_state.SetInclusive(0, tile_aggregate);
-
-            }
-        }
-        else
-        {
-            // Exclusive scan of values and segment_flags
-            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
-            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op);
-            tile_aggregate = prefix_op.GetBlockAggregate();
-        }
-
-        // Scatter updated values
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (scatter_pairs[ITEM].key != pairs[ITEM].key)
-            {
-                // Update the value at the key location
-                ValueT value    = d_fixup_in[scatter_pairs[ITEM].key];
-                value           = reduction_op(value, scatter_pairs[ITEM].value);
-
-                d_aggregates_out[scatter_pairs[ITEM].key] = value;
-            }
-        }
-
-        // Finalize the last item
-        if (IS_LAST_TILE)
-        {
-            // Last thread will output final count and last item, if necessary
-            if (threadIdx.x == BLOCK_THREADS - 1)
-            {
-                // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
-                if (num_remaining == TILE_ITEMS)
-                {
-                    // Update the value at the key location
-                    OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key;
-                    d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]);
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Scan tiles of items as part of a dynamic chained scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_items,          ///< Total number of input items
-        int                 num_tiles,          ///< Total number of input tiles
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
-        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                    // Global offset for the current tile
-        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
-
-        if (num_remaining > TILE_ITEMS)
-        {
-            // Not the last tile (full)
-            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
-        }
-        else if (num_remaining > 0)
-        {
-            // The last tile (possibly partially-full)
-            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/agent/agent_select_if.cuh b/external/cub/cub/agent/agent_select_if.cuh
deleted file mode 100644
index f365481915b..00000000000
--- a/external/cub/cub/agent/agent_select_if.cuh
+++ /dev/null
@@ -1,703 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "single_pass_scan_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentSelectIf
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct AgentSelectIfPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-
-/**
- * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection
- *
- * Performs functor-based selection if SelectOpT functor type != NullType
- * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
- * Otherwise performs discontinuity selection (keep unique)
- */
-template <
-    typename    AgentSelectIfPolicyT,           ///< Parameterized AgentSelectIfPolicy tuning policy type
-    typename    InputIteratorT,                 ///< Random-access input iterator type for selection items
-    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename    SelectedOutputIteratorT,        ///< Random-access input iterator type for selection_flags items
-    typename    SelectOpT,                      ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
-    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
-    typename    OffsetT,                        ///< Signed integer type for global offsets
-    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
-struct AgentSelectIf
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
-        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // The flag value type
-    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<OffsetT> ScanTileStateT;
-
-    // Constants
-    enum
-    {
-        USE_SELECT_OP,
-        USE_SELECT_FLAGS,
-        USE_DISCONTINUITY,
-
-        BLOCK_THREADS           = AgentSelectIfPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = AgentSelectIfPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-        TWO_PHASE_SCATTER       = (ITEMS_PER_THREAD > 1),
-
-        SELECT_METHOD           = (!Equals<SelectOpT, NullType>::VALUE) ?
-                                    USE_SELECT_OP :
-                                    (!Equals<FlagT, NullType>::VALUE) ?
-                                        USE_SELECT_FLAGS :
-                                        USE_DISCONTINUITY
-    };
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for items
-    typedef typename If<IsPointer<InputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, InputT, OffsetT>,        // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
-        WrappedInputIteratorT;
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
-    typedef typename If<IsPointer<FlagsInputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, FlagT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            FlagsInputIteratorT>::Type                                                          // Directly use the supplied input iterator type
-        WrappedFlagsInputIteratorT;
-
-    // Parameterized BlockLoad type for input data
-    typedef BlockLoad<
-            OutputT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentSelectIfPolicyT::LOAD_ALGORITHM>
-        BlockLoadT;
-
-    // Parameterized BlockLoad type for flags
-    typedef BlockLoad<
-            FlagT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentSelectIfPolicyT::LOAD_ALGORITHM>
-        BlockLoadFlags;
-
-    // Parameterized BlockDiscontinuity type for items
-    typedef BlockDiscontinuity<
-            OutputT,
-            BLOCK_THREADS>
-        BlockDiscontinuityT;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            OffsetT,
-            BLOCK_THREADS,
-            AgentSelectIfPolicyT::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef TilePrefixCallbackOp<
-            OffsetT,
-            cub::Sum,
-            ScanTileStateT>
-        TilePrefixCallbackOpT;
-
-    // Item exchange type
-    typedef OutputT ItemExchangeT[TILE_ITEMS];
-
-    // Shared memory type for this thread block
-    union _TempStorage
-    {
-        struct
-        {
-            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
-            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
-            typename BlockDiscontinuityT::TempStorage       discontinuity;  // Smem needed for discontinuity detection
-        };
-
-        // Smem needed for loading items
-        typename BlockLoadT::TempStorage load_items;
-
-        // Smem needed for loading values
-        typename BlockLoadFlags::TempStorage load_flags;
-
-        // Smem needed for compacting items (allows non POD items in this union)
-        Uninitialized<ItemExchangeT> raw_exchange;
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
-    WrappedInputIteratorT           d_in;               ///< Input items
-    SelectedOutputIteratorT         d_selected_out;     ///< Unique output items
-    WrappedFlagsInputIteratorT      d_flags_in;         ///< Input selection flags (if applicable)
-    InequalityWrapper<EqualityOpT>  inequality_op;      ///< T inequality operator
-    SelectOpT                       select_op;          ///< Selection operator
-    OffsetT                         num_items;          ///< Total number of input items
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    AgentSelectIf(
-        TempStorage                 &temp_storage,      ///< Reference to temp_storage
-        InputIteratorT              d_in,               ///< Input data
-        FlagsInputIteratorT         d_flags_in,         ///< Input selection flags (if applicable)
-        SelectedOutputIteratorT     d_selected_out,     ///< Output data
-        SelectOpT                   select_op,          ///< Selection operator
-        EqualityOpT                 equality_op,        ///< Equality operator
-        OffsetT                     num_items)          ///< Total number of input items
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_flags_in(d_flags_in),
-        d_selected_out(d_selected_out),
-        select_op(select_op),
-        inequality_op(equality_op),
-        num_items(num_items)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for initializing the selections
-    //---------------------------------------------------------------------
-
-    /**
-     * Initialize selections (specialized for selection operator)
-     */
-    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        OffsetT                     /*tile_offset*/,
-        OffsetT                     num_tile_items,
-        OutputT                     (&items)[ITEMS_PER_THREAD],
-        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
-        Int2Type<USE_SELECT_OP>     /*select_method*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            // Out-of-bounds items are selection_flags
-            selection_flags[ITEM] = 1;
-
-            if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
-                selection_flags[ITEM] = select_op(items[ITEM]);
-        }
-    }
-
-
-    /**
-     * Initialize selections (specialized for valid flags)
-     */
-    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        OffsetT                     tile_offset,
-        OffsetT                     num_tile_items,
-        OutputT                     (&/*items*/)[ITEMS_PER_THREAD],
-        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
-        Int2Type<USE_SELECT_FLAGS>  /*select_method*/)
-    {
-        CTA_SYNC();
-
-        FlagT flags[ITEMS_PER_THREAD];
-
-        if (IS_LAST_TILE)
-        {
-            // Out-of-bounds items are selection_flags
-            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1);
-        }
-        else
-        {
-            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
-        }
-
-        // Convert flag type to selection_flags type
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            selection_flags[ITEM] = flags[ITEM];
-        }
-    }
-
-
-    /**
-     * Initialize selections (specialized for discontinuity detection)
-     */
-    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        OffsetT                     tile_offset,
-        OffsetT                     num_tile_items,
-        OutputT                     (&items)[ITEMS_PER_THREAD],
-        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
-        Int2Type<USE_DISCONTINUITY> /*select_method*/)
-    {
-        if (IS_FIRST_TILE)
-        {
-            CTA_SYNC();
-
-            // Set head selection_flags.  First tile sets the first flag for the first item
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op);
-        }
-        else
-        {
-            OutputT tile_predecessor;
-            if (threadIdx.x == 0)
-                tile_predecessor = d_in[tile_offset - 1];
-
-            CTA_SYNC();
-
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor);
-        }
-
-        // Set selection flags for out-of-bounds items
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            // Set selection_flags for out-of-bounds items
-            if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
-                selection_flags[ITEM] = 1;
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Scatter utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Scatter flagged items to output offsets (specialized for direct scattering)
-     */
-    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
-    __device__ __forceinline__ void ScatterDirect(
-        OutputT (&items)[ITEMS_PER_THREAD],
-        OffsetT (&selection_flags)[ITEMS_PER_THREAD],
-        OffsetT (&selection_indices)[ITEMS_PER_THREAD],
-        OffsetT num_selections)
-    {
-        // Scatter flagged items
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (selection_flags[ITEM])
-            {
-                if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections)
-                {
-                    d_selected_out[selection_indices[ITEM]] = items[ITEM];
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Scatter flagged items to output offsets (specialized for two-phase scattering)
-     */
-    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        OutputT         (&items)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
-        int             /*num_tile_items*/,                         ///< Number of valid items in this tile
-        int             num_tile_selections,                        ///< Number of selections in this tile
-        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
-        OffsetT         /*num_rejected_prefix*/,                    ///< Total number of rejections prior to this tile
-        Int2Type<false> /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
-    {
-        CTA_SYNC();
-
-        // Compact and scatter items
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix;
-            if (selection_flags[ITEM])
-            {
-                temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
-            }
-        }
-
-        CTA_SYNC();
-
-        for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
-        {
-            d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
-        }
-    }
-
-
-    /**
-     * Scatter flagged items to output offsets (specialized for two-phase scattering)
-     */
-    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        OutputT         (&items)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
-        int             num_tile_items,                             ///< Number of valid items in this tile
-        int             num_tile_selections,                        ///< Number of selections in this tile
-        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
-        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
-        Int2Type<true>  /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
-    {
-        CTA_SYNC();
-
-        int tile_num_rejections = num_tile_items - num_tile_selections;
-
-        // Scatter items to shared memory (rejections first)
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            int item_idx                = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
-            int local_selection_idx     = selection_indices[ITEM] - num_selections_prefix;
-            int local_rejection_idx     = item_idx - local_selection_idx;
-            int local_scatter_offset    = (selection_flags[ITEM]) ?
-                                            tile_num_rejections + local_selection_idx :
-                                            local_rejection_idx;
-
-            temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
-        }
-
-        CTA_SYNC();
-
-        // Gather items from shared memory and scatter to global
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            int item_idx            = (ITEM * BLOCK_THREADS) + threadIdx.x;
-            int rejection_idx       = item_idx;
-            int selection_idx       = item_idx - tile_num_rejections;
-            OffsetT scatter_offset  = (item_idx < tile_num_rejections) ?
-                                        num_items - num_rejected_prefix - rejection_idx - 1 :
-                                        num_selections_prefix + selection_idx;
-
-            OutputT item = temp_storage.raw_exchange.Alias()[item_idx];
-
-            if (!IS_LAST_TILE || (item_idx < num_tile_items))
-            {
-                d_selected_out[scatter_offset] = item;
-            }
-        }
-    }
-
-
-    /**
-     * Scatter flagged items
-     */
-    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
-    __device__ __forceinline__ void Scatter(
-        OutputT         (&items)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
-        int             num_tile_items,                             ///< Number of valid items in this tile
-        int             num_tile_selections,                        ///< Number of selections in this tile
-        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
-        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
-        OffsetT         num_selections)                             ///< Total number of selections including this tile
-    {
-        // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one
-        if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS)))
-        {
-            ScatterTwoPhase<IS_LAST_TILE, IS_FIRST_TILE>(
-                items,
-                selection_flags,
-                selection_indices,
-                num_tile_items,
-                num_tile_selections,
-                num_selections_prefix,
-                num_rejected_prefix,
-                Int2Type<KEEP_REJECTS>());
-        }
-        else
-        {
-            ScatterDirect<IS_LAST_TILE, IS_FIRST_TILE>(
-                items,
-                selection_flags,
-                selection_indices,
-                num_selections);
-        }
-    }
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-
-    /**
-     * Process first tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
-     */
-    template <bool IS_LAST_TILE>
-    __device__ __forceinline__ OffsetT ConsumeFirstTile(
-        int                 num_tile_items,      ///< Number of input items comprising this tile
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        OutputT     items[ITEMS_PER_THREAD];
-        OffsetT     selection_flags[ITEMS_PER_THREAD];
-        OffsetT     selection_indices[ITEMS_PER_THREAD];
-
-        // Load items
-        if (IS_LAST_TILE)
-            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
-        else
-            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
-
-        // Initialize selection_flags
-        InitializeSelections<true, IS_LAST_TILE>(
-            tile_offset,
-            num_tile_items,
-            items,
-            selection_flags,
-            Int2Type<SELECT_METHOD>());
-
-        CTA_SYNC();
-
-        // Exclusive scan of selection_flags
-        OffsetT num_tile_selections;
-        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections);
-
-        if (threadIdx.x == 0)
-        {
-            // Update tile status if this is not the last tile
-            if (!IS_LAST_TILE)
-                tile_state.SetInclusive(0, num_tile_selections);
-        }
-
-        // Discount any out-of-bounds selections
-        if (IS_LAST_TILE)
-            num_tile_selections -= (TILE_ITEMS - num_tile_items);
-
-        // Scatter flagged items
-        Scatter<IS_LAST_TILE, true>(
-            items,
-            selection_flags,
-            selection_indices,
-            num_tile_items,
-            num_tile_selections,
-            0,
-            0,
-            num_tile_selections);
-
-        return num_tile_selections;
-    }
-
-
-    /**
-     * Process subsequent tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
-     */
-    template <bool IS_LAST_TILE>
-    __device__ __forceinline__ OffsetT ConsumeSubsequentTile(
-        int                 num_tile_items,      ///< Number of input items comprising this tile
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        OutputT     items[ITEMS_PER_THREAD];
-        OffsetT     selection_flags[ITEMS_PER_THREAD];
-        OffsetT     selection_indices[ITEMS_PER_THREAD];
-
-        // Load items
-        if (IS_LAST_TILE)
-            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
-        else
-            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
-
-        // Initialize selection_flags
-        InitializeSelections<false, IS_LAST_TILE>(
-            tile_offset,
-            num_tile_items,
-            items,
-            selection_flags,
-            Int2Type<SELECT_METHOD>());
-
-        CTA_SYNC();
-
-        // Exclusive scan of values and selection_flags
-        TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx);
-        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op);
-
-        OffsetT num_tile_selections     = prefix_op.GetBlockAggregate();
-        OffsetT num_selections          = prefix_op.GetInclusivePrefix();
-        OffsetT num_selections_prefix   = prefix_op.GetExclusivePrefix();
-        OffsetT num_rejected_prefix     = (tile_idx * TILE_ITEMS) - num_selections_prefix;
-
-        // Discount any out-of-bounds selections
-        if (IS_LAST_TILE)
-        {
-            int num_discount    = TILE_ITEMS - num_tile_items;
-            num_selections      -= num_discount;
-            num_tile_selections -= num_discount;
-        }
-
-        // Scatter flagged items
-        Scatter<IS_LAST_TILE, false>(
-            items,
-            selection_flags,
-            selection_indices,
-            num_tile_items,
-            num_tile_selections,
-            num_selections_prefix,
-            num_rejected_prefix,
-            num_selections);
-
-        return num_selections;
-    }
-
-
-    /**
-     * Process a tile of input
-     */
-    template <bool IS_LAST_TILE>
-    __device__ __forceinline__ OffsetT ConsumeTile(
-        int                 num_tile_items,         ///< Number of input items comprising this tile
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        OffsetT num_selections;
-        if (tile_idx == 0)
-        {
-            num_selections = ConsumeFirstTile<IS_LAST_TILE>(num_tile_items, tile_offset, tile_state);
-        }
-        else
-        {
-            num_selections = ConsumeSubsequentTile<IS_LAST_TILE>(num_tile_items, tile_idx, tile_offset, tile_state);
-        }
-
-        return num_selections;
-    }
-
-
-    /**
-     * Scan tiles of items as part of a dynamic chained scan
-     */
-    template <typename NumSelectedIteratorT>        ///< Output iterator type for recording number of items selection_flags
-    __device__ __forceinline__ void ConsumeRange(
-        int                     num_tiles,          ///< Total number of input tiles
-        ScanTileStateT&         tile_state,         ///< Global tile state descriptor
-        NumSelectedIteratorT    d_num_selected_out) ///< Output total number selection_flags
-    {
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
-        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                    // Global offset for the current tile
-
-        if (tile_idx < num_tiles - 1)
-        {
-            // Not the last tile (full)
-            ConsumeTile<false>(TILE_ITEMS, tile_idx, tile_offset, tile_state);
-        }
-        else
-        {
-            // The last tile (possibly partially-full)
-            OffsetT num_remaining   = num_items - tile_offset;
-            OffsetT num_selections  = ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
-
-            if (threadIdx.x == 0)
-            {
-                // Output the total number of items selection_flags
-                *d_num_selected_out = num_selections;
-            }
-        }
-    }
-
-};
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/agent/agent_spmv_orig.cuh b/external/cub/cub/agent/agent_spmv_orig.cuh
deleted file mode 100644
index 4e7cb609f76..00000000000
--- a/external/cub/cub/agent/agent_spmv_orig.cuh
+++ /dev/null
@@ -1,670 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../util_type.cuh"
-#include "../block/block_reduce.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../thread/thread_search.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/counting_input_iterator.cuh"
-#include "../iterator/tex_ref_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentSpmv
- */
-template <
-    int                             _BLOCK_THREADS,                         ///< Threads per thread block
-    int                             _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
-    CacheLoadModifier               _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,      ///< Cache load modifier for reading CSR row-offsets during search
-    CacheLoadModifier               _ROW_OFFSETS_LOAD_MODIFIER,             ///< Cache load modifier for reading CSR row-offsets
-    CacheLoadModifier               _COLUMN_INDICES_LOAD_MODIFIER,          ///< Cache load modifier for reading CSR column-indices
-    CacheLoadModifier               _VALUES_LOAD_MODIFIER,                  ///< Cache load modifier for reading CSR values
-    CacheLoadModifier               _VECTOR_VALUES_LOAD_MODIFIER,           ///< Cache load modifier for reading vector values
-    bool                            _DIRECT_LOAD_NONZEROS,                  ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
-    BlockScanAlgorithm              _SCAN_ALGORITHM>                        ///< The BlockScan algorithm to use
-struct AgentSpmvPolicy
-{
-    enum
-    {
-        BLOCK_THREADS                                                   = _BLOCK_THREADS,                       ///< Threads per thread block
-        ITEMS_PER_THREAD                                                = _ITEMS_PER_THREAD,                    ///< Items per thread (per tile of input)
-        DIRECT_LOAD_NONZEROS                                            = _DIRECT_LOAD_NONZEROS,                ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory)
-    };
-
-    static const CacheLoadModifier  ROW_OFFSETS_SEARCH_LOAD_MODIFIER    = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;    ///< Cache load modifier for reading CSR row-offsets
-    static const CacheLoadModifier  ROW_OFFSETS_LOAD_MODIFIER           = _ROW_OFFSETS_LOAD_MODIFIER;           ///< Cache load modifier for reading CSR row-offsets
-    static const CacheLoadModifier  COLUMN_INDICES_LOAD_MODIFIER        = _COLUMN_INDICES_LOAD_MODIFIER;        ///< Cache load modifier for reading CSR column-indices
-    static const CacheLoadModifier  VALUES_LOAD_MODIFIER                = _VALUES_LOAD_MODIFIER;                ///< Cache load modifier for reading CSR values
-    static const CacheLoadModifier  VECTOR_VALUES_LOAD_MODIFIER         = _VECTOR_VALUES_LOAD_MODIFIER;         ///< Cache load modifier for reading vector values
-    static const BlockScanAlgorithm SCAN_ALGORITHM                      = _SCAN_ALGORITHM;                      ///< The BlockScan algorithm to use
-
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-template <
-    typename        ValueT,              ///< Matrix and vector value type
-    typename        OffsetT>             ///< Signed integer type for sequence offsets
-struct SpmvParams
-{
-    ValueT*         d_values;            ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
-    OffsetT*        d_row_end_offsets;   ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
-    OffsetT*        d_column_indices;    ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
-    ValueT*         d_vector_x;          ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
-    ValueT*         d_vector_y;          ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
-    int             num_rows;            ///< Number of rows of matrix <b>A</b>.
-    int             num_cols;            ///< Number of columns of matrix <b>A</b>.
-    int             num_nonzeros;        ///< Number of nonzero elements of matrix <b>A</b>.
-    ValueT          alpha;               ///< Alpha multiplicand
-    ValueT          beta;                ///< Beta addend-multiplicand
-
-    TexRefInputIterator<ValueT, 66778899, OffsetT>  t_vector_x;
-};
-
-
-/**
- * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
- */
-template <
-    typename    AgentSpmvPolicyT,           ///< Parameterized AgentSpmvPolicy tuning policy type
-    typename    ValueT,                     ///< Matrix and vector value type
-    typename    OffsetT,                    ///< Signed integer type for sequence offsets
-    bool        HAS_ALPHA,                  ///< Whether the input parameter \p alpha is 1
-    bool        HAS_BETA,                   ///< Whether the input parameter \p beta is 0
-    int         PTX_ARCH = CUB_PTX_ARCH>    ///< PTX compute capability
-struct AgentSpmv
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        BLOCK_THREADS           = AgentSpmvPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = AgentSpmvPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-    /// 2D merge path coordinate type
-    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;
-
-    /// Input iterator wrapper types (for applying cache modifiers)
-
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
-            OffsetT,
-            OffsetT>
-        RowOffsetsSearchIteratorT;
-
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER,
-            OffsetT,
-            OffsetT>
-        RowOffsetsIteratorT;
-
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER,
-            OffsetT,
-            OffsetT>
-        ColumnIndicesIteratorT;
-
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::VALUES_LOAD_MODIFIER,
-            ValueT,
-            OffsetT>
-        ValueIteratorT;
-
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
-            ValueT,
-            OffsetT>
-        VectorValueIteratorT;
-
-    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
-    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;
-
-    // Reduce-value-by-segment scan operator
-    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;
-
-    // BlockReduce specialization
-    typedef BlockReduce<
-            ValueT,
-            BLOCK_THREADS,
-            BLOCK_REDUCE_WARP_REDUCTIONS>
-        BlockReduceT;
-
-    // BlockScan specialization
-    typedef BlockScan<
-            KeyValuePairT,
-            BLOCK_THREADS,
-            AgentSpmvPolicyT::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // BlockScan specialization
-    typedef BlockScan<
-            ValueT,
-            BLOCK_THREADS,
-            AgentSpmvPolicyT::SCAN_ALGORITHM>
-        BlockPrefixSumT;
-
-    // BlockExchange specialization
-    typedef BlockExchange<
-            ValueT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD>
-        BlockExchangeT;
-
-    /// Merge item type (either a non-zero value or a row-end offset)
-    union MergeItem
-    {
-        // Value type to pair with index type OffsetT (NullType if loading values directly during merge)
-        typedef typename If<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>::Type MergeValueT;
-
-        OffsetT     row_end_offset;
-        MergeValueT nonzero;
-    };
-
-    /// Shared memory type required by this thread block
-    struct _TempStorage
-    {
-        CoordinateT tile_coords[2];
-
-        union Aliasable
-        {
-            // Smem needed for tile of merge items
-            MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];
-
-            // Smem needed for block exchange
-            typename BlockExchangeT::TempStorage exchange;
-
-            // Smem needed for block-wide reduction
-            typename BlockReduceT::TempStorage reduce;
-
-            // Smem needed for tile scanning
-            typename BlockScanT::TempStorage scan;
-
-            // Smem needed for tile prefix sum
-            typename BlockPrefixSumT::TempStorage prefix_sum;
-
-        } aliasable;
-    };
-
-    /// Temporary storage type (unionable)
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-
-    _TempStorage&                   temp_storage;         /// Reference to temp_storage
-
-    SpmvParams<ValueT, OffsetT>&    spmv_params;
-
-    ValueIteratorT                  wd_values;            ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
-    RowOffsetsIteratorT             wd_row_end_offsets;   ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
-    ColumnIndicesIteratorT          wd_column_indices;    ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
-    VectorValueIteratorT            wd_vector_x;          ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
-    VectorValueIteratorT            wd_vector_y;          ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentSpmv(
-        TempStorage&                    temp_storage,           ///< Reference to temp_storage
-        SpmvParams<ValueT, OffsetT>&    spmv_params)            ///< SpMV input parameter bundle
-    :
-        temp_storage(temp_storage.Alias()),
-        spmv_params(spmv_params),
-        wd_values(spmv_params.d_values),
-        wd_row_end_offsets(spmv_params.d_row_end_offsets),
-        wd_column_indices(spmv_params.d_column_indices),
-        wd_vector_x(spmv_params.d_vector_x),
-        wd_vector_y(spmv_params.d_vector_y)
-    {}
-
-
-
-
-    /**
-     * Consume a merge tile, specialized for direct-load of nonzeros
-     */
-    __device__ __forceinline__ KeyValuePairT ConsumeTile(
-        int             tile_idx,
-        CoordinateT     tile_start_coord,
-        CoordinateT     tile_end_coord,
-        Int2Type<true>  is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
-    {
-        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
-        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
-        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
-
-        // Gather the row end-offsets for the merge tile into shared memory
-        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
-        {
-            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
-        }
-
-        CTA_SYNC();
-
-        // Search for the thread's starting coordinate within the merge tile
-        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
-        CoordinateT                     thread_start_coord;
-
-        MergePathSearch(
-            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
-            s_tile_row_end_offsets,                     // List A
-            tile_nonzero_indices,                       // List B
-            tile_num_rows,
-            tile_num_nonzeros,
-            thread_start_coord);
-
-        CTA_SYNC();            // Perf-sync
-
-        // Compute the thread's merge path segment
-        CoordinateT     thread_current_coord = thread_start_coord;
-        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
-
-        ValueT          running_total = 0.0;
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            OffsetT nonzero_idx         = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1);
-            OffsetT column_idx          = wd_column_indices[nonzero_idx];
-            ValueT  value               = wd_values[nonzero_idx];
-
-            ValueT  vector_value        = spmv_params.t_vector_x[column_idx];
-#if (CUB_PTX_ARCH >= 350)
-            vector_value                = wd_vector_x[column_idx];
-#endif
-            ValueT  nonzero             = value * vector_value;
-
-            OffsetT row_end_offset      = s_tile_row_end_offsets[thread_current_coord.x];
-
-            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
-            {
-                // Move down (accumulate)
-                running_total += nonzero;
-                scan_segment[ITEM].value    = running_total;
-                scan_segment[ITEM].key      = tile_num_rows;
-                ++thread_current_coord.y;
-            }
-            else
-            {
-                // Move right (reset)
-                scan_segment[ITEM].value    = running_total;
-                scan_segment[ITEM].key      = thread_current_coord.x;
-                running_total               = 0.0;
-                ++thread_current_coord.x;
-            }
-        }
-
-        CTA_SYNC();
-
-        // Block-wide reduce-value-by-segment
-        KeyValuePairT       tile_carry;
-        ReduceBySegmentOpT  scan_op;
-        KeyValuePairT       scan_item;
-
-        scan_item.value = running_total;
-        scan_item.key   = thread_current_coord.x;
-
-        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
-
-        if (tile_num_rows > 0)
-        {
-            if (threadIdx.x == 0)
-                scan_item.key = -1;
-
-            // Direct scatter
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                if (scan_segment[ITEM].key < tile_num_rows)
-                {
-                    if (scan_item.key == scan_segment[ITEM].key)
-                        scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value;
-
-                    if (HAS_ALPHA)
-                    {
-                        scan_segment[ITEM].value *= spmv_params.alpha;
-                    }
-
-                    if (HAS_BETA)
-                    {
-                        // Update the output vector element
-                        ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key];
-                        scan_segment[ITEM].value += addend;
-                    }
-
-                    // Set the output vector element
-                    spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value;
-                }
-            }
-        }
-
-        // Return the tile's running carry-out
-        return tile_carry;
-    }
-
-
-
-    /**
-     * Consume a merge tile, specialized for indirect load of nonzeros
-     */
-    __device__ __forceinline__ KeyValuePairT ConsumeTile(
-        int             tile_idx,
-        CoordinateT     tile_start_coord,
-        CoordinateT     tile_end_coord,
-        Int2Type<false> is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
-    {
-        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
-        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
-
-#if (CUB_PTX_ARCH >= 520)
-
-        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
-        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
-
-        // Gather the nonzeros for the merge tile into shared memory
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
-
-            ValueIteratorT a                = wd_values + tile_start_coord.y + nonzero_idx;
-            ColumnIndicesIteratorT ci       = wd_column_indices + tile_start_coord.y + nonzero_idx;
-            ValueT* s                       = s_tile_nonzeros + nonzero_idx;
-
-            if (nonzero_idx < tile_num_nonzeros)
-            {
-
-                OffsetT column_idx              = *ci;
-                ValueT  value                   = *a;
-
-                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
-                vector_value                    = wd_vector_x[column_idx];
-
-                ValueT  nonzero                 = value * vector_value;
-
-                *s    = nonzero;
-            }
-        }
-
-
-#else
-
-        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
-        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
-
-        // Gather the nonzeros for the merge tile into shared memory
-        if (tile_num_nonzeros > 0)
-        {
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                int     nonzero_idx             = threadIdx.x + (ITEM * BLOCK_THREADS);
-                nonzero_idx                     = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);
-
-                OffsetT column_idx              = wd_column_indices[tile_start_coord.y + nonzero_idx];
-                ValueT  value                   = wd_values[tile_start_coord.y + nonzero_idx];
-
-                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
-#if (CUB_PTX_ARCH >= 350)
-                vector_value                    = wd_vector_x[column_idx];
-#endif
-                ValueT  nonzero                 = value * vector_value;
-
-                s_tile_nonzeros[nonzero_idx]    = nonzero;
-            }
-        }
-
-#endif
-
-        // Gather the row end-offsets for the merge tile into shared memory
-        #pragma unroll 1
-        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
-        {
-            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
-        }
-
-        CTA_SYNC();
-
-        // Search for the thread's starting coordinate within the merge tile
-        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
-        CoordinateT                     thread_start_coord;
-
-        MergePathSearch(
-            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
-            s_tile_row_end_offsets,                     // List A
-            tile_nonzero_indices,                       // List B
-            tile_num_rows,
-            tile_num_nonzeros,
-            thread_start_coord);
-
-        CTA_SYNC();            // Perf-sync
-
-        // Compute the thread's merge path segment
-        CoordinateT     thread_current_coord = thread_start_coord;
-        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
-        ValueT          running_total = 0.0;
-
-        OffsetT row_end_offset  = s_tile_row_end_offsets[thread_current_coord.x];
-        ValueT  nonzero         = s_tile_nonzeros[thread_current_coord.y];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
-            {
-                // Move down (accumulate)
-                scan_segment[ITEM].value    = nonzero;
-                running_total               += nonzero;
-                ++thread_current_coord.y;
-                nonzero                     = s_tile_nonzeros[thread_current_coord.y];
-            }
-            else
-            {
-                // Move right (reset)
-                scan_segment[ITEM].value    = 0.0;
-                running_total               = 0.0;
-                ++thread_current_coord.x;
-                row_end_offset              = s_tile_row_end_offsets[thread_current_coord.x];
-            }
-
-            scan_segment[ITEM].key = thread_current_coord.x;
-        }
-
-        CTA_SYNC();
-
-        // Block-wide reduce-value-by-segment
-        KeyValuePairT       tile_carry;
-        ReduceBySegmentOpT  scan_op;
-        KeyValuePairT       scan_item;
-
-        scan_item.value = running_total;
-        scan_item.key = thread_current_coord.x;
-
-        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
-
-        if (threadIdx.x == 0)
-        {
-            scan_item.key = thread_start_coord.x;
-            scan_item.value = 0.0;
-        }
-
-        if (tile_num_rows > 0)
-        {
-
-            CTA_SYNC();
-
-            // Scan downsweep and scatter
-            ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;
-
-            if (scan_item.key != scan_segment[0].key)
-            {
-                s_partials[scan_item.key] = scan_item.value;
-            }
-            else
-            {
-                scan_segment[0].value += scan_item.value;
-            }
-
-            #pragma unroll
-            for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
-                {
-                    s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value;
-                }
-                else
-                {
-                    scan_segment[ITEM].value += scan_segment[ITEM - 1].value;
-                }
-            }
-
-            CTA_SYNC();
-
-            #pragma unroll 1
-            for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
-            {
-                spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];
-            }
-        }
-
-        // Return the tile's running carry-out
-        return tile_carry;
-    }
-
-
-    /**
-     * Consume input tile
-     */
-    __device__ __forceinline__ void ConsumeTile(
-        CoordinateT*    d_tile_coordinates,     ///< [in] Pointer to the temporary array of tile starting coordinates
-        KeyValuePairT*  d_tile_carry_pairs,     ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
-        int             num_merge_tiles)        ///< [in] Number of merge tiles
-    {
-        int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
-
-        if (tile_idx >= num_merge_tiles)
-            return;
-
-        // Read our starting coordinates
-        if (threadIdx.x < 2)
-        {
-            if (d_tile_coordinates == NULL)
-            {
-                // Search our starting coordinates
-                OffsetT                         diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
-                CoordinateT                     tile_coord;
-                CountingInputIterator<OffsetT>  nonzero_indices(0);
-
-                // Search the merge path
-                MergePathSearch(
-                    diagonal,
-                    RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
-                    nonzero_indices,
-                    spmv_params.num_rows,
-                    spmv_params.num_nonzeros,
-                    tile_coord);
-
-                temp_storage.tile_coords[threadIdx.x] = tile_coord;
-            }
-            else
-            {
-                temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x];
-            }
-        }
-
-        CTA_SYNC();
-
-        CoordinateT tile_start_coord     = temp_storage.tile_coords[0];
-        CoordinateT tile_end_coord       = temp_storage.tile_coords[1];
-
-        // Consume multi-segment tile
-        KeyValuePairT tile_carry = ConsumeTile(
-            tile_idx,
-            tile_start_coord,
-            tile_end_coord,
-            Int2Type<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS>());
-
-        // Output the tile's carry-out
-        if (threadIdx.x == 0)
-        {
-            if (HAS_ALPHA)
-                tile_carry.value *= spmv_params.alpha;
-
-            tile_carry.key += tile_start_coord.x;
-            d_tile_carry_pairs[tile_idx]    = tile_carry;
-        }
-    }
-
-
-};
-
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/agent/single_pass_scan_operators.cuh b/external/cub/cub/agent/single_pass_scan_operators.cuh
deleted file mode 100644
index 2f6713792dd..00000000000
--- a/external/cub/cub/agent/single_pass_scan_operators.cuh
+++ /dev/null
@@ -1,815 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Callback operator types for supplying BlockScan prefixes
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../warp/warp_reduce.cuh"
-#include "../util_arch.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Prefix functor type for maintaining a running prefix while scanning a
- * region independent of other thread blocks
- ******************************************************************************/
-
-/**
- * Stateful callback operator type for supplying BlockScan prefixes.
- * Maintains a running prefix that can be applied to consecutive
- * BlockScan operations.
- */
-template <
-    typename T,                 ///< BlockScan value type
-    typename ScanOpT>            ///< Wrapped scan operator type
-struct BlockScanRunningPrefixOp
-{
-    ScanOpT     op;                 ///< Wrapped scan operator
-    T           running_total;      ///< Running block-wide prefix
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op)
-    :
-        op(op)
-    {}
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanRunningPrefixOp(
-        T starting_prefix,
-        ScanOpT op)
-    :
-        op(op),
-        running_total(starting_prefix)
-    {}
-
-    /**
-     * Prefix callback operator.  Returns the block-wide running_total in thread-0.
-     */
-    __device__ __forceinline__ T operator()(
-        const T &block_aggregate)              ///< The aggregate sum of the BlockScan inputs
-    {
-        T retval = running_total;
-        running_total = op(running_total, block_aggregate);
-        return retval;
-    }
-};
-
-
-/******************************************************************************
- * Generic tile status interface types for block-cooperative scans
- ******************************************************************************/
-
-/**
- * Enumerations of tile status
- */
-enum ScanTileStatus
-{
-    SCAN_TILE_OOB,          // Out-of-bounds (e.g., padding)
-    SCAN_TILE_INVALID = 99, // Not yet processed
-    SCAN_TILE_PARTIAL,      // Tile aggregate is available
-    SCAN_TILE_INCLUSIVE,    // Inclusive tile prefix is available
-};
-
-
-/**
- * Tile status interface.
- */
-template <
-    typename    T,
-    bool        SINGLE_WORD = Traits<T>::PRIMITIVE>
-struct ScanTileState;
-
-
-/**
- * Tile status interface specialized for scan status and value types
- * that can be combined into one machine word that can be
- * read/written coherently in a single access.
- */
-template <typename T>
-struct ScanTileState<T, true>
-{
-    // Status word type
-    typedef typename If<(sizeof(T) == 8),
-        long long,
-        typename If<(sizeof(T) == 4),
-            int,
-            typename If<(sizeof(T) == 2),
-                short,
-                char>::Type>::Type>::Type StatusWord;
-
-
-    // Unit word type
-    typedef typename If<(sizeof(T) == 8),
-        longlong2,
-        typename If<(sizeof(T) == 4),
-            int2,
-            typename If<(sizeof(T) == 2),
-                int,
-                uchar2>::Type>::Type>::Type TxnWord;
-
-
-    // Device word type
-    struct TileDescriptor
-    {
-        StatusWord  status;
-        T           value;
-    };
-
-
-    // Constants
-    enum
-    {
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-
-    // Device storage
-    TxnWord *d_tile_descriptors;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ScanTileState()
-    :
-        d_tile_descriptors(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     /*num_tiles*/,                      ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-        TxnWord val = TxnWord();
-        TileDescriptor *descriptor = reinterpret_cast<TileDescriptor*>(&val);
-
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            descriptor->status = StatusWord(SCAN_TILE_INVALID);
-            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            descriptor->status = StatusWord(SCAN_TILE_OOB);
-            d_tile_descriptors[threadIdx.x] = val;
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
-        tile_descriptor.value = tile_inclusive;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_PARTIAL;
-        tile_descriptor.value = tile_partial;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int             tile_idx,
-        StatusWord      &status,
-        T               &value)
-    {
-        TileDescriptor tile_descriptor;
-        do
-        {
-            __threadfence_block(); // prevent hoisting loads from loop
-            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-
-        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
-
-        status = tile_descriptor.status;
-        value = tile_descriptor.value;
-    }
-
-};
-
-
-
-/**
- * Tile status interface specialized for scan status and value types that
- * cannot be combined into one machine word.
- */
-template <typename T>
-struct ScanTileState<T, false>
-{
-    // Status word type
-    typedef char StatusWord;
-
-    // Constants
-    enum
-    {
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-    // Device storage
-    StatusWord  *d_tile_status;
-    T           *d_tile_partial;
-    T           *d_tile_inclusive;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ScanTileState()
-    :
-        d_tile_status(NULL),
-        d_tile_partial(NULL),
-        d_tile_inclusive(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     num_tiles,                          ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        cudaError_t error = cudaSuccess;
-        do
-        {
-            void*   allocations[3];
-            size_t  allocation_sizes[3];
-
-            allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);           // bytes needed for tile status descriptors
-            allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for partials
-            allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for inclusives
-
-            // Compute allocation pointers into the single storage blob
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-
-            // Alias the offsets
-            d_tile_status       = reinterpret_cast<StatusWord*>(allocations[0]);
-            d_tile_partial      = reinterpret_cast<T*>(allocations[1]);
-            d_tile_inclusive    = reinterpret_cast<T*>(allocations[2]);
-        }
-        while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        // Specify storage allocation requirements
-        size_t  allocation_sizes[3];
-        allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);         // bytes needed for tile status descriptors
-        allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for partials
-        allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for inclusives
-
-        // Set the necessary size of the blob
-        void* allocations[3];
-        return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes));
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID);
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB);
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
-    {
-        // Update tile inclusive value
-        ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
-    {
-        // Update tile partial value
-        ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int             tile_idx,
-        StatusWord      &status,
-        T               &value)
-    {
-        do {
-            status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
-
-            __threadfence();    // prevent hoisting loads from loop or loads below above this one
-
-        } while (status == SCAN_TILE_INVALID);
-
-        if (status == StatusWord(SCAN_TILE_PARTIAL)) 
-            value = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
-        else
-            value = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
-    }
-};
-
-
-/******************************************************************************
- * ReduceByKey tile status interface types for block-cooperative scans
- ******************************************************************************/
-
-/**
- * Tile status interface for reduction by key.
- *
- */
-template <
-    typename    ValueT,
-    typename    KeyT,
-    bool        SINGLE_WORD = (Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)>
-struct ReduceByKeyScanTileState;
-
-
-/**
- * Tile status interface for reduction by key, specialized for scan status and value types that
- * cannot be combined into one machine word.
- */
-template <
-    typename    ValueT,
-    typename    KeyT>
-struct ReduceByKeyScanTileState<ValueT, KeyT, false> :
-    ScanTileState<KeyValuePair<KeyT, ValueT> >
-{
-    typedef ScanTileState<KeyValuePair<KeyT, ValueT> > SuperClass;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ReduceByKeyScanTileState() : SuperClass() {}
-};
-
-
-/**
- * Tile status interface for reduction by key, specialized for scan status and value types that
- * can be combined into one machine word that can be read/written coherently in a single access.
- */
-template <
-    typename ValueT,
-    typename KeyT>
-struct ReduceByKeyScanTileState<ValueT, KeyT, true>
-{
-    typedef KeyValuePair<KeyT, ValueT>KeyValuePairT;
-
-    // Constants
-    enum
-    {
-        PAIR_SIZE           = sizeof(ValueT) + sizeof(KeyT),
-        TXN_WORD_SIZE       = 1 << Log2<PAIR_SIZE + 1>::VALUE,
-        STATUS_WORD_SIZE    = TXN_WORD_SIZE - PAIR_SIZE,
-
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-    // Status word type
-    typedef typename If<(STATUS_WORD_SIZE == 8),
-        long long,
-        typename If<(STATUS_WORD_SIZE == 4),
-            int,
-            typename If<(STATUS_WORD_SIZE == 2),
-                short,
-                char>::Type>::Type>::Type StatusWord;
-
-    // Status word type
-    typedef typename If<(TXN_WORD_SIZE == 16),
-        longlong2,
-        typename If<(TXN_WORD_SIZE == 8),
-            long long,
-            int>::Type>::Type TxnWord;
-
-    // Device word type (for when sizeof(ValueT) == sizeof(KeyT))
-    struct TileDescriptorBigStatus
-    {
-        KeyT        key;
-        ValueT      value;
-        StatusWord  status;
-    };
-
-    // Device word type (for when sizeof(ValueT) != sizeof(KeyT))
-    struct TileDescriptorLittleStatus
-    {
-        ValueT      value;
-        StatusWord  status;
-        KeyT        key;
-    };
-
-    // Device word type
-    typedef typename If<
-            (sizeof(ValueT) == sizeof(KeyT)),
-            TileDescriptorBigStatus,
-            TileDescriptorLittleStatus>::Type
-        TileDescriptor;
-
-
-    // Device storage
-    TxnWord *d_tile_descriptors;
-
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ReduceByKeyScanTileState()
-    :
-        d_tile_descriptors(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     /*num_tiles*/,                      ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int             tile_idx    = (blockIdx.x * blockDim.x) + threadIdx.x;
-        TxnWord         val         = TxnWord();
-        TileDescriptor  *descriptor = reinterpret_cast<TileDescriptor*>(&val);
-
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            descriptor->status = StatusWord(SCAN_TILE_INVALID);
-            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            descriptor->status = StatusWord(SCAN_TILE_OOB);
-            d_tile_descriptors[threadIdx.x] = val;
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status  = SCAN_TILE_INCLUSIVE;
-        tile_descriptor.value   = tile_inclusive.value;
-        tile_descriptor.key     = tile_inclusive.key;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status  = SCAN_TILE_PARTIAL;
-        tile_descriptor.value   = tile_partial.value;
-        tile_descriptor.key     = tile_partial.key;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int                     tile_idx,
-        StatusWord              &status,
-        KeyValuePairT           &value)
-    {
-//        TxnWord         alias           = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-//        TileDescriptor  tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-//
-//        while (tile_descriptor.status == SCAN_TILE_INVALID)
-//        {
-//            __threadfence_block(); // prevent hoisting loads from loop
-//
-//            alias           = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-//            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-//        }
-//
-//        status      = tile_descriptor.status;
-//        value.value = tile_descriptor.value;
-//        value.key   = tile_descriptor.key;
-
-        TileDescriptor tile_descriptor;
-        do
-        {
-            __threadfence_block(); // prevent hoisting loads from loop
-            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-
-        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
-
-        status      = tile_descriptor.status;
-        value.value = tile_descriptor.value;
-        value.key   = tile_descriptor.key;
-    }
-
-};
-
-
-/******************************************************************************
- * Prefix call-back operator for coupling local block scan within a
- * block-cooperative scan
- ******************************************************************************/
-
-/**
- * Stateful block-scan prefix functor.  Provides the the running prefix for
- * the current tile by using the call-back warp to wait on on
- * aggregates/prefixes from predecessor tiles to become available.
- */
-template <
-    typename    T,
-    typename    ScanOpT,
-    typename    ScanTileStateT,
-    int         PTX_ARCH = CUB_PTX_ARCH>
-struct TilePrefixCallbackOp
-{
-    // Parameterized warp reduce
-    typedef WarpReduce<T, CUB_PTX_WARP_THREADS, PTX_ARCH> WarpReduceT;
-
-    // Temporary storage type
-    struct _TempStorage
-    {
-        typename WarpReduceT::TempStorage   warp_reduce;
-        T                                   exclusive_prefix;
-        T                                   inclusive_prefix;
-        T                                   block_aggregate;
-    };
-
-    // Alias wrapper allowing temporary storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-    // Type of status word
-    typedef typename ScanTileStateT::StatusWord StatusWord;
-
-    // Fields
-    _TempStorage&               temp_storage;       ///< Reference to a warp-reduction instance
-    ScanTileStateT&             tile_status;        ///< Interface to tile status
-    ScanOpT                     scan_op;            ///< Binary scan operator
-    int                         tile_idx;           ///< The current tile index
-    T                           exclusive_prefix;   ///< Exclusive prefix for the tile
-    T                           inclusive_prefix;   ///< Inclusive prefix for the tile
-
-    // Constructor
-    __device__ __forceinline__
-    TilePrefixCallbackOp(
-        ScanTileStateT       &tile_status,
-        TempStorage         &temp_storage,
-        ScanOpT              scan_op,
-        int                 tile_idx)
-    :
-        temp_storage(temp_storage.Alias()),
-        tile_status(tile_status),
-        scan_op(scan_op),
-        tile_idx(tile_idx) {}
-
-
-    // Block until all predecessors within the warp-wide window have non-invalid status
-    __device__ __forceinline__
-    void ProcessWindow(
-        int         predecessor_idx,        ///< Preceding tile index to inspect
-        StatusWord  &predecessor_status,    ///< [out] Preceding tile status
-        T           &window_aggregate)      ///< [out] Relevant partial reduction from this window of preceding tiles
-    {
-        T value;
-        tile_status.WaitForValid(predecessor_idx, predecessor_status, value);
-
-        // Perform a segmented reduction to get the prefix for the current window.
-        // Use the swizzled scan operator because we are now scanning *down* towards thread0.
-
-        int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
-        window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce(
-            value,
-            tail_flag,
-            SwizzleScanOp<ScanOpT>(scan_op));
-    }
-
-
-    // BlockScan prefix callback functor (called by the first warp)
-    __device__ __forceinline__
-    T operator()(T block_aggregate)
-    {
-
-        // Update our status with our tile-aggregate
-        if (threadIdx.x == 0)
-        {
-            temp_storage.block_aggregate = block_aggregate;
-            tile_status.SetPartial(tile_idx, block_aggregate);
-        }
-
-        int         predecessor_idx = tile_idx - threadIdx.x - 1;
-        StatusWord  predecessor_status;
-        T           window_aggregate;
-
-        // Wait for the warp-wide window of predecessor tiles to become valid
-        ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
-
-        // The exclusive tile prefix starts out as the current window aggregate
-        exclusive_prefix = window_aggregate;
-
-        // Keep sliding the window back until we come across a tile whose inclusive prefix is known
-        while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff))
-        {
-            predecessor_idx -= CUB_PTX_WARP_THREADS;
-
-            // Update exclusive tile prefix with the window prefix
-            ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
-            exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
-        }
-
-        // Compute the inclusive tile prefix and update the status for this tile
-        if (threadIdx.x == 0)
-        {
-            inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
-            tile_status.SetInclusive(tile_idx, inclusive_prefix);
-
-            temp_storage.exclusive_prefix = exclusive_prefix;
-            temp_storage.inclusive_prefix = inclusive_prefix;
-        }
-
-        // Return exclusive_prefix
-        return exclusive_prefix;
-    }
-
-    // Get the exclusive prefix stored in temporary storage
-    __device__ __forceinline__
-    T GetExclusivePrefix()
-    {
-        return temp_storage.exclusive_prefix;
-    }
-
-    // Get the inclusive prefix stored in temporary storage
-    __device__ __forceinline__
-    T GetInclusivePrefix()
-    {
-        return temp_storage.inclusive_prefix;
-    }
-
-    // Get the block aggregate stored in temporary storage
-    __device__ __forceinline__
-    T GetBlockAggregate()
-    {
-        return temp_storage.block_aggregate;
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/block_adjacent_difference.cuh b/external/cub/cub/block/block_adjacent_difference.cuh
deleted file mode 100644
index 1125fe59cea..00000000000
--- a/external/cub/cub/block/block_adjacent_difference.cuh
+++ /dev/null
@@ -1,596 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../util_type.cuh"
-#include "../util_ptx.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-template <
-    typename    T,
-    int         BLOCK_DIM_X,
-    int         BLOCK_DIM_Y     = 1,
-    int         BLOCK_DIM_Z     = 1,
-    int         PTX_ARCH        = CUB_PTX_ARCH>
-class BlockAdjacentDifference
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-
-    /// Shared memory storage layout type (last element from each thread's input)
-    struct _TempStorage
-    {
-        T first_items[BLOCK_THREADS];
-        T last_items[BLOCK_THREADS];
-    };
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /// Specialization for when FlagOp has third index param
-    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
-    struct ApplyOp
-    {
-        // Apply flag operator
-        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
-        {
-            return flag_op(b, a, idx);
-        }
-    };
-
-    /// Specialization for when FlagOp does not have a third index param
-    template <typename FlagOp>
-    struct ApplyOp<FlagOp, false>
-    {
-        // Apply flag operator
-        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
-        {
-            return flag_op(b, a);
-        }
-    };
-
-    /// Templated unrolling of item comparison (inductive case)
-    template <int ITERATION, int MAX_ITERATIONS>
-    struct Iterate
-    {
-        // Head flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagHeads(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {
-            preds[ITERATION] = input[ITERATION - 1];
-
-            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                preds[ITERATION],
-                input[ITERATION],
-                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
-
-            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
-        }
-
-        // Tail flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagTails(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {
-            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITERATION],
-                input[ITERATION + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
-
-            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
-        }
-
-    };
-
-    /// Templated unrolling of item comparison (termination case)
-    template <int MAX_ITERATIONS>
-    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
-    {
-        // Head flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagHeads(
-            int                     /*linear_tid*/,
-            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
-        {}
-
-        // Tail flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagTails(
-            int                     /*linear_tid*/,
-            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
-        {}
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-public:
-
-    /// \smemstorage{BlockDiscontinuity}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockAdjacentDifference()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockAdjacentDifference(
-        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Head flag operations
-     *********************************************************************/
-    //@{
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share last item
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        if (linear_tid == 0)
-        {
-            // Set flag for first thread-item (preds[0] is undefined)
-            head_flags[0] = 1;
-        }
-        else
-        {
-            preds[0] = temp_storage.last_items[linear_tid - 1];
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
-        }
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-    }
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-    {
-        // Share last item
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        // Set flag for first thread-item
-        preds[0] = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-    }
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        T preds[ITEMS_PER_THREAD];
-        FlagHeads(head_flags, input, preds, flag_op);
-    }
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-    {
-        T preds[ITEMS_PER_THREAD];
-        FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
-    }
-
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagTails(
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first item
-        temp_storage.first_items[linear_tid] = input[0];
-
-        CTA_SYNC();
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagTails(
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-    {
-        // Share first item
-        temp_storage.first_items[linear_tid] = input[0];
-
-        CTA_SYNC();
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        preds[0] = temp_storage.last_items[linear_tid - 1];
-        if (linear_tid == 0)
-        {
-            head_flags[0] = 1;
-        }
-        else
-        {
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                preds[0],
-                input[0],
-                linear_tid * ITEMS_PER_THREAD);
-        }
-
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        if (linear_tid == 0)
-        {
-            head_flags[0] = 1;
-        }
-        else
-        {
-            preds[0] = temp_storage.last_items[linear_tid - 1];
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                preds[0],
-                input[0],
-                linear_tid * ITEMS_PER_THREAD);
-        }
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        preds[0] = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            preds[0],
-            input[0],
-            linear_tid * ITEMS_PER_THREAD);
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        preds[0] = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            preds[0],
-            input[0],
-            linear_tid * ITEMS_PER_THREAD);
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/block/block_discontinuity.cuh b/external/cub/cub/block/block_discontinuity.cuh
deleted file mode 100644
index 428882f70ab..00000000000
--- a/external/cub/cub/block/block_discontinuity.cuh
+++ /dev/null
@@ -1,1148 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../util_type.cuh"
-#include "../util_ptx.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                The data type to be flagged.
- * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
- * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
- *   that differ from their predecessors (or successors).  For example, head flags are convenient
- *   for demarcating disjoint data segments as part of a segmented scan or reduction.
- * - \blocked
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par A Simple Example
- * \blockcollective{BlockDiscontinuity}
- * \par
- * The code snippet below illustrates the head flagging of 512 integer items that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
- *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
- *
- *     // Allocate shared memory for BlockDiscontinuity
- *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Collectively compute head flags for discontinuities in the segment
- *     int head_flags[4];
- *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is
- * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
- * The corresponding output \p head_flags in those threads will be
- * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
- *
- * \par Performance Considerations
- * - Incurs zero bank conflicts for most types
- *
- */
-template <
-    typename    T,
-    int         BLOCK_DIM_X,
-    int         BLOCK_DIM_Y     = 1,
-    int         BLOCK_DIM_Z     = 1,
-    int         PTX_ARCH        = CUB_PTX_ARCH>
-class BlockDiscontinuity
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// Total number of threads in the block (product of the X, Y and Z block dimensions)
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-
-    /// Shared memory storage layout type (one slot per thread for its first and for its last input item)
-    struct _TempStorage
-    {
-        T first_items[BLOCK_THREADS];   ///< Slot i receives thread i's input[0] (written by the tail-flag methods)
-        T last_items[BLOCK_THREADS];    ///< Slot i receives thread i's input[ITEMS_PER_THREAD - 1] (written by the head-flag methods)
-    };
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator: returns a reference to a function-scope __shared__ instance (used by the default constructor)
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /// Primary template, used when HAS_PARAM is true (by default taken from BinaryOpHasIdxParam<T, FlagOp>): FlagOp is called with a third index argument
-    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
-    struct ApplyOp
-    {
-        // Apply flag operator, forwarding idx as its third argument
-        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
-        {
-            return flag_op(a, b, idx);
-        }
-    };
-
-    /// Partial specialization for HAS_PARAM == false: FlagOp is called with the two items only
-    template <typename FlagOp>
-    struct ApplyOp<FlagOp, false>
-    {
-        // Apply flag operator; the index is accepted so both variants share one call signature, but it is not forwarded
-        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
-        {
-            return flag_op(a, b);
-        }
-    };
-
-    /// Templated unrolling of item comparison (inductive case): handles item ITERATION, then recurses with ITERATION + 1 until it equals MAX_ITERATIONS
-    template <int ITERATION, int MAX_ITERATIONS>
-    struct Iterate
-    {
-        // Head flags: compare item ITERATION with the item preceding it in the same thread
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagHeads(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {
-            preds[ITERATION] = input[ITERATION - 1];
-            // The index handed to the predicate is the position of input[ITERATION] within the whole tile
-            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                preds[ITERATION],
-                input[ITERATION],
-                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
-            // Compile-time recursion onto the next item
-            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
-        }
-
-        // Tail flags: compare item ITERATION with the item following it in the same thread
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagTails(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity tail_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {
-            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITERATION],
-                input[ITERATION + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
-            // The index above is the tile position of the successor item, input[ITERATION + 1]
-            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
-        }
-
-    };
-
-    /// Templated unrolling of item comparison (termination case): ITERATION has reached MAX_ITERATIONS, so both methods are empty
-    template <int MAX_ITERATIONS>
-    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
-    {
-        // Head flags (no-op that ends the recursion)
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagHeads(
-            int                     /*linear_tid*/,
-            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags (untouched here)
-            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items (unused here)
-            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items (untouched here)
-            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate (unused here)
-        {}
-
-        // Tail flags (no-op that ends the recursion)
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagTails(
-            int                     /*linear_tid*/,
-            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity tail_flags (untouched here)
-            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items (unused here)
-            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate (unused here)
-        {}
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-public:
-
-    /// \smemstorage{BlockDiscontinuity}
-    struct TempStorage : Uninitialized<_TempStorage> {};    // Public handle; the storage-taking constructor obtains the _TempStorage view through Alias()
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockDiscontinuity()
-    :
-        temp_storage(PrivateStorage()),                                     // Bind to the function-scope __shared__ instance returned by PrivateStorage()
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))      // Calling thread's linear id for the compile-time block shape (RowMajorTid is defined elsewhere)
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockDiscontinuity(
-        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),                                 // The argument shadows the member here: Alias() is called on the caller's TempStorage
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))      // Calling thread's linear id for the compile-time block shape (RowMajorTid is defined elsewhere)
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Head flag operations
-     *********************************************************************/
-    //@{
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Head flags for the calling thread's items
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Items owned by the calling thread
-        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Predecessor of each item (slot 0 is not written by thread 0)
-        FlagOp          flag_op)                            ///< [in] Predicate deciding whether two adjacent items are discontinuous
-    {
-        // Publish this thread's final item: it is the predecessor of the next thread's first item
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        // Every slot has to be written before a neighbouring slot is read
-        CTA_SYNC();
-
-        if (linear_tid != 0)
-        {
-            // Predecessor of item 0 is the last item of the previous thread
-            preds[0] = temp_storage.last_items[linear_tid - 1];
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(
-                flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
-        }
-        else
-        {
-            // First item of the tile has nothing to compare against: always flagged
-            head_flags[0] = 1;
-        }
-
-        // Remaining items are compared against the preceding item of the same thread
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-    }
-
-    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Head flags for the calling thread's items
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Items owned by the calling thread
-        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Predecessor of each of the calling thread's items
-        FlagOp          flag_op,                            ///< [in] Predicate deciding whether two adjacent items are discontinuous
-        T               tile_predecessor_item)              ///< [in] Only read by thread 0: the item that input[0] of thread 0 is compared against
-    {
-        // Publish this thread's final item: it is the predecessor of the next thread's first item
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        // Every slot has to be written before a neighbouring slot is read
-        CTA_SYNC();
-
-        // Thread 0 takes the caller-supplied predecessor, everyone else the previous thread's last item
-        if (linear_tid == 0)
-            preds[0] = tile_predecessor_item;
-        else
-            preds[0] = temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(
-            flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
-
-        // Remaining items are compared against the preceding item of the same thread
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-    }
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-    /**
-     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute head flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
-     * The corresponding output \p head_flags in those threads will be
-     * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Head flags for the calling thread's items
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Items owned by the calling thread
-        FlagOp          flag_op)                            ///< [in] Predicate deciding whether two adjacent items are discontinuous
-    {
-        // Scratch space for the predecessor items; it is discarded when this overload returns
-        T predecessor_items[ITEMS_PER_THREAD];
-
-        // Hand over to the overload that also reports predecessors
-        FlagHeads(head_flags, input, predecessor_items, flag_op);
-    }
-
-
-    /**
-     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
-     *   against \p tile_predecessor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread0 obtain the predecessor item for the entire tile
-     *     int tile_predecessor_item;
-     *     if (threadIdx.x == 0) tile_predecessor_item = ...
-     *
-     *     // Collectively compute head flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagHeads(
-     *         head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>,
-     * and that \p tile_predecessor_item is \p 0.  The corresponding output \p head_flags in those threads will be
-     * <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Head flags for the calling thread's items
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Items owned by the calling thread
-        FlagOp          flag_op,                            ///< [in] Predicate deciding whether two adjacent items are discontinuous
-        T               tile_predecessor_item)              ///< [in] Only read by thread 0: the item that input[0] of thread 0 is compared against
-    {
-        // Scratch space for the predecessor items; it is discarded when this overload returns
-        T predecessor_items[ITEMS_PER_THREAD];
-
-        // Hand over to the overload that also reports predecessors
-        FlagHeads(head_flags, input, predecessor_items, flag_op, tile_predecessor_item);
-    }
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Tail flag operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged.
-     *
-     * \par
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute tail flags for discontinuities in the segment
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
-     * The corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
-    __device__ __forceinline__ void FlagTails(
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Tail flags for the calling thread's items
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Items owned by the calling thread
-        FlagOp          flag_op)                            ///< [in] Predicate deciding whether two adjacent items are discontinuous
-    {
-        // Publish this thread's first item: it is the successor of the previous thread's last item
-        temp_storage.first_items[linear_tid] = input[0];
-
-        // Every slot has to be written before a neighbouring slot is read
-        CTA_SYNC();
-
-        // The last item of the tile has no successor and is always flagged; any other
-        // thread compares its last item with the first item of the following thread.
-        int last_item_flag = 1;
-        if (linear_tid != BLOCK_THREADS - 1)
-        {
-            last_item_flag = ApplyOp<FlagOp>::FlagT(
-                flag_op, input[ITEMS_PER_THREAD - 1], temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-        }
-        tail_flags[ITEMS_PER_THREAD - 1] = last_item_flag;
-
-        // Remaining items are compared against the following item of the same thread
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
-     *   against \p tile_successor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread127 obtain the successor item for the entire tile
-     *     int tile_successor_item;
-     *     if (threadIdx.x == 127) tile_successor_item = ...
-     *
-     *     // Collectively compute tail flags for discontinuities in the segment
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(
-     *         tail_flags, thread_data, cub::Inequality(), tile_successor_item);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
-     * and that \p tile_successor_item is \p 125.  The corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <int ITEMS_PER_THREAD, typename FlagT, typename FlagOp>
-    __device__ __forceinline__ void FlagTails(
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Tail flags for the calling thread's items
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Items owned by the calling thread
-        FlagOp          flag_op,                            ///< [in] Predicate deciding whether two adjacent items are discontinuous
-        T               tile_successor_item)                ///< [in] Only read by the last thread: the item that its last input item is compared against
-    {
-        // Publish this thread's first item: it is the successor of the
-        // previous thread's last item
-        temp_storage.first_items[linear_tid] = input[0];
-
-        // Every slot has to be written before a neighbouring slot is read
-        CTA_SYNC();
-
-        // Successor of this thread's last item: the first item of the following thread,
-        // or the caller-supplied tile_successor_item for the last thread of the block
-        T next_item = (linear_tid != BLOCK_THREADS - 1) ?
-            temp_storage.first_items[linear_tid + 1] :
-            tile_successor_item;
-
-        // The index handed to the predicate is the tile position of the successor item
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op, input[ITEMS_PER_THREAD - 1], next_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Remaining items are compared against the following item of the same thread
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Head & tail flag operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute head and tail flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-     *         head_flags, tail_flags, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
-     * (this overload takes no tile predecessor/successor item).  The corresponding output \p head_flags
-     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>,
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item.  Thread 0 has no predecessor thread, so only the other
-        // threads may index last_items[linear_tid - 1] (that index is out of bounds for thread 0).
-        if (linear_tid == 0)
-        {
-            head_flags[0] = 1;
-        }
-        else
-        {
-            preds[0] = temp_storage.last_items[linear_tid - 1];
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                preds[0],
-                input[0],
-                linear_tid * ITEMS_PER_THREAD);
-        }
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Computes head and tail discontinuity flags for items partitioned across the thread block, comparing the tile's last item against a caller-supplied successor.
-     *
-     * \par
-     * - <tt>head_flags<sub><em>i</em></sub></tt> is set for
-     *   <tt>input<sub><em>i</em></sub></tt> whenever
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   yields \p true (<em>previous-item</em> being the preceding item
-     *   of the same thread, or the last item of the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
-     * - <tt>tail_flags<sub><em>i</em></sub></tt> is set for
-     *   <tt>input<sub><em>i</em></sub></tt> whenever
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   yields \p true (<em>next-item</em> being the following item
-     *   of the same thread, or the first item of the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
-     *   against \p tile_successor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The snippet below head- and tail-flags 512 integer items held in a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3) by 128 threads,
-     * four consecutive items per thread.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread127 obtain the successor item for the entire tile
-     *     int tile_successor_item;
-     *     if (threadIdx.x == 127) tile_successor_item = ...
-     *
-     *     // Collectively compute head and tail flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-     *         head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
-     * and that the \p tile_successor_item is \p 125.  The corresponding output \p head_flags
-     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>,
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Publish each thread's boundary items through shared memory
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T prev_items[ITEMS_PER_THREAD];
-
-        // Head flag of this thread's first item
-        if (linear_tid != 0)
-        {
-            // Compare against the previous thread's last item
-            prev_items[0] = temp_storage.last_items[linear_tid - 1];
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(
-                flag_op, prev_items[0],
-                input[0], linear_tid * ITEMS_PER_THREAD);
-        }
-        else
-        {
-            head_flags[0] = 1;              // First thread: always flagged
-        }
-
-        // Tail flag of this thread's last item
-        const bool is_last_thread = (linear_tid == BLOCK_THREADS - 1);
-        T next_item = is_last_thread ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            next_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Remaining head_flags (items after the first)
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, prev_items, flag_op);
-
-        // Remaining tail_flags (items before the last)
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Computes head and tail discontinuity flags for items partitioned across the thread block, comparing the tile's first item against a caller-supplied predecessor.
-     *
-     * \par
-     * - <tt>head_flags<sub><em>i</em></sub></tt> is set for
-     *   <tt>input<sub><em>i</em></sub></tt> whenever
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   yields \p true (<em>previous-item</em> being the preceding item
-     *   of the same thread, or the last item of the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
-     *   against \p tile_predecessor_item.
-     * - <tt>tail_flags<sub><em>i</em></sub></tt> is set for
-     *   <tt>input<sub><em>i</em></sub></tt> whenever
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   yields \p true (<em>next-item</em> being the following item
-     *   of the same thread, or the first item of the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The snippet below head- and tail-flags 512 integer items held in a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3) by 128 threads,
-     * four consecutive items per thread.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread0 obtain the predecessor item for the entire tile
-     *     int tile_predecessor_item;
-     *     if (threadIdx.x == 0) tile_predecessor_item = ...
-     *
-     *     // Collectively compute head and tail flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-     *         head_flags, tile_predecessor_item, tail_flags,
-     *         thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
-     * and that the \p tile_predecessor_item is \p 0.
-     * The corresponding output \p head_flags
-     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>,
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
-     * \par
-     * Only <em>thread</em><sub>0</sub>'s \p tile_predecessor_item is consulted; the value
-     * passed by other threads is ignored.  No successor item is taken by this overload:
-     * the tile's last item is unconditionally tail-flagged.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Publish each thread's boundary items through shared memory
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T prev_items[ITEMS_PER_THREAD];
-
-        // Predecessor of this thread's first item, then its head flag
-        const bool is_first_thread = (linear_tid == 0);
-        prev_items[0] = is_first_thread ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(
-            flag_op, prev_items[0],
-            input[0], linear_tid * ITEMS_PER_THREAD);
-
-        // Tail flag of this thread's last item
-        const bool is_last_thread = (linear_tid == BLOCK_THREADS - 1);
-        tail_flags[ITEMS_PER_THREAD - 1] = is_last_thread ?
-            1 :                             // Last thread: always flagged
-            ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Remaining head_flags (items after the first)
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, prev_items, flag_op);
-
-        // Remaining tail_flags (items before the last)
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Computes head and tail discontinuity flags for items partitioned across the thread block, using caller-supplied predecessor and successor items at the tile boundaries.
-     *
-     * \par
-     * - <tt>head_flags<sub><em>i</em></sub></tt> is set for
-     *   <tt>input<sub><em>i</em></sub></tt> whenever
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   yields \p true (<em>previous-item</em> being the preceding item
-     *   of the same thread, or the last item of the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
-     *   against \p tile_predecessor_item.
-     * - <tt>tail_flags<sub><em>i</em></sub></tt> is set for
-     *   <tt>input<sub><em>i</em></sub></tt> whenever
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   yields \p true (<em>next-item</em> being the following item
-     *   of the same thread, or the first item of the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
-     *   against \p tile_successor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The snippet below head- and tail-flags 512 integer items held in a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3) by 128 threads,
-     * four consecutive items per thread.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread0 obtain the predecessor item for the entire tile
-     *     int tile_predecessor_item;
-     *     if (threadIdx.x == 0) tile_predecessor_item = ...
-     *
-     *     // Have thread127 obtain the successor item for the entire tile
-     *     int tile_successor_item;
-     *     if (threadIdx.x == 127) tile_successor_item = ...
-     *
-     *     // Collectively compute head and tail flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
-     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
-     *         thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
-     * that the \p tile_predecessor_item is \p 0, and that the
-     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
-     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>,
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>bool operator()(const T &a, const T &b)</tt> or member <tt>bool operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Publish each thread's boundary items through shared memory
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T prev_items[ITEMS_PER_THREAD];
-
-        // Predecessor of this thread's first item, then its head flag
-        const bool is_first_thread = (linear_tid == 0);
-        prev_items[0] = is_first_thread ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            prev_items[0], input[0],
-            linear_tid * ITEMS_PER_THREAD);
-
-        // Successor of this thread's last item, then its tail flag
-        const bool is_last_thread = (linear_tid == BLOCK_THREADS - 1);
-        T next_item = is_last_thread ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1], next_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Remaining head_flags (items after the first)
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, prev_items, flag_op);
-
-        // Remaining tail_flags (items before the last)
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-
-
-    //@}  end member group
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/block/block_exchange.cuh b/external/cub/cub/block/block_exchange.cuh
deleted file mode 100644
index c0e32fda555..00000000000
--- a/external/cub/cub/block/block_exchange.cuh
+++ /dev/null
@@ -1,1248 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
- * \ingroup BlockModule
- *
- * \tparam InputT               The data type to be exchanged.
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of items partitioned onto each thread.
- * \tparam WARP_TIME_SLICING    <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds.  Yields a smaller memory footprint at the expense of decreased parallelism.  (Default: false)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - It is commonplace for blocks of threads to rearrange data items between
- *   threads.  For example, the device-accessible memory subsystem prefers access patterns
- *   where data items are "striped" across threads (where consecutive threads access consecutive items),
- *   yet most block-wide operations prefer a "blocked" partitioning of items across threads
- *   (where consecutive items belong to a single thread).
- * - BlockExchange supports the following types of data exchanges:
- *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>striped</em>](index.html#sec5sec3) arrangements
- *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>warp-striped</em>](index.html#sec5sec3) arrangements
- *   - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec3)
- *   - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec3)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockExchange}
- * \par
- * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
- * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
- *
- *     // Allocate shared memory for BlockExchange
- *     __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- *     // Load a tile of data striped across threads
- *     int thread_data[4];
- *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
- *
- *     // Collectively exchange data into a blocked arrangement across threads
- *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of striped input \p thread_data across the block of threads is
- * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt>.
- * The corresponding output \p thread_data in those threads will be
- * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
- *
- * \par Performance Considerations
- * - Proper device-specific padding ensures zero bank conflicts for most types.
- *
- */
-template <
-    typename    InputT,
-    int         BLOCK_DIM_X,
-    int         ITEMS_PER_THREAD,
-    bool        WARP_TIME_SLICING   = false,
-    int         BLOCK_DIM_Y         = 1,
-    int         BLOCK_DIM_Z         = 1,
-    int         PTX_ARCH            = CUB_PTX_ARCH>
-class BlockExchange
-{
-private:
-
-    /******************************************************************************
-     * Constants
-     ******************************************************************************/
-
-    /// Compile-time constants derived from the class template parameters
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),                           ///< Base-2 log of WARP_THREADS for PTX_ARCH
-        WARP_THREADS                = 1 << LOG_WARP_THREADS,                                    ///< Threads per warp
-        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,        ///< Warps per block (BLOCK_THREADS / WARP_THREADS, rounded up)
-
-        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),                             ///< Base-2 log of SMEM_BANKS for PTX_ARCH
-        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,                                      ///< Number of shared memory banks
-
-        TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD,                         ///< Items held by the whole thread block
-
-        TIME_SLICES                 = (WARP_TIME_SLICING) ? WARPS : 1,                          ///< Exchange rounds: one per warp when time-slicing, otherwise one
-
-        TIME_SLICED_THREADS         = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,   ///< Threads per time slice (at most one warp when time-slicing, otherwise the whole block)
-        TIME_SLICED_ITEMS           = TIME_SLICED_THREADS * ITEMS_PER_THREAD,                   ///< Items per time slice; sizes the shared buffer before padding
-
-        WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),                     ///< Smaller of the block size and the warp size (independent of WARP_TIME_SLICING)
-        WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,              ///< Items owned by WARP_TIME_SLICED_THREADS threads
-
-        // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
-        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),  ///< Nonzero when the padding scheme above is enabled
-        PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,     ///< Extra buffer slots: one per SMEM_BANKS items when padding, otherwise none
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Shared memory storage layout type: a single exchange buffer, declared with 16-byte alignment
-    struct __align__(16) _TempStorage
-    {
-        InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];     ///< Staging buffer: TIME_SLICED_ITEMS item slots plus PADDING_ITEMS padding slots
-    };
-
-public:
-
-    /// \smemstorage{BlockExchange}
-    struct TempStorage : Uninitialized<_TempStorage> {};    // Public storage type: _TempStorage wrapped in Uninitialized<>
-
-private:
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id, followed by the per-thread warp coordinates derived in the constructors
-    unsigned int linear_tid;
-    unsigned int lane_id;       ///< Value returned by LaneId()
-    unsigned int warp_id;       ///< linear_tid / WARP_THREADS (0 when WARPS == 1)
-    unsigned int warp_offset;   ///< warp_id * WARP_TIME_SLICED_ITEMS, the base offset used by the non-time-sliced warp-striped exchanges
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator: returns a reference to a function-local __shared__ _TempStorage instance
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for no timeslicing: the whole tile is staged in temp_storage in one pass.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, in <em>blocked</em> arrangement.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>striped</em> arrangement.
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;   // blocked slot: thread-major
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;   // skip one padding slot per SMEM_BANKS items
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        CTA_SYNC();   // separates the buffer writes above from the buffer reads below
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;   // striped slot: item-major
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for warp-timeslicing: one warp's items are staged per slice and every thread picks out the striped items that fall inside that slice.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, in <em>blocked</em> arrangement.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>striped</em> arrangement.
-        Int2Type<true>  /*time_slicing*/)
-    {
-        InputT temp_items[ITEMS_PER_THREAD];   // results are gathered here and copied to output_items only after all slices (presumably to support input_items aliasing output_items)
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;   // first tile position covered by this slice
-            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;   // one past the last tile position covered
-
-            CTA_SYNC();   // reads of the previous slice precede this; the buffer is overwritten next
-
-            if (warp_id == SLICE)   // only the warp matching this slice deposits its items
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage.buff[item_offset] = input_items[ITEM];
-                }
-            }
-
-            CTA_SYNC();   // separates the slice writes above from the reads below
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                // Read a strip of items
-                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
-                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
-
-                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))   // strip range intersects slice range
-                {
-                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;   // this thread's striped position relative to the slice
-                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
-                    {
-                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_items[ITEM] = temp_storage.buff[item_offset];
-                    }
-                }
-            }
-        }
-
-        // Copy the gathered items to the output array
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            output_items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for no timeslicing: every offset is relative to warp_offset and only a warp-level sync is issued.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToWarpStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, in <em>blocked</em> arrangement.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>warp-striped</em> arrangement.
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);   // blocked slot within this warp's segment
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        WARP_SYNC(0xffffffff);   // full-mask warp sync between the writes above and the reads below
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;   // warp-striped slot within the same segment
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for warp-timeslicing: warps take turns using the start of the buffer, warp 0 first, then one warp per slice.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToWarpStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, in <em>blocked</em> arrangement.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>warp-striped</em> arrangement.
-        Int2Type<true>  /*time_slicing*/)
-    {
-        if (warp_id == 0)   // slice 0 is handled here, without a preceding CTA_SYNC()
-        {
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
-                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                temp_storage.buff[item_offset] = input_items[ITEM];
-            }
-
-            WARP_SYNC(0xffffffff);
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                output_items[ITEM] = temp_storage.buff[item_offset];
-            }
-        }
-
-        #pragma unroll
-        for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
-        {
-            CTA_SYNC();   // reached by every thread each iteration, before the next warp reuses the buffer
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);   // no warp_offset: each slice starts at buffer offset 0
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage.buff[item_offset] = input_items[ITEM];
-                }
-
-                WARP_SYNC(0xffffffff);
-
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    output_items[ITEM] = temp_storage.buff[item_offset];
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing: the whole tile is staged in temp_storage in one pass.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void StripedToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, in <em>striped</em> arrangement.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>blocked</em> arrangement.
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;   // striped slot: item-major
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;   // skip one padding slot per SMEM_BANKS items
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        CTA_SYNC();   // separates the buffer writes above from the buffer reads below
-
-        // Read back in blocked order
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;   // blocked slot: thread-major
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing: per slice, every thread deposits the striped items that land in the slice and the matching warp reads its blocked items back.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void StripedToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, in <em>striped</em> arrangement.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>blocked</em> arrangement.
-        Int2Type<true>  /*time_slicing*/)
-    {
-        // Results are gathered here and copied to output_items only after all slices
-        InputT temp_items[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;   // first tile position covered by this slice
-            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;   // one past the last tile position covered
-
-            CTA_SYNC();   // reads of the previous slice precede this; the buffer is overwritten next
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                // Write a strip of items
-                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
-                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
-
-                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))   // strip range intersects slice range
-                {
-                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;   // this thread's striped position relative to the slice
-                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
-                    {
-                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_storage.buff[item_offset] = input_items[ITEM];
-                    }
-                }
-            }
-
-            CTA_SYNC();   // separates the slice writes above from the reads below
-
-            if (warp_id == SLICE)   // only the warp matching this slice collects its blocked items
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_items[ITEM] = temp_storage.buff[item_offset];
-                }
-            }
-        }
-
-        // Copy the gathered items to the output array
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            output_items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing: every offset is relative to warp_offset and only a warp-level sync is issued.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void WarpStripedToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, in <em>warp-striped</em> arrangement.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>blocked</em> arrangement.
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;   // warp-striped slot within this warp's segment
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        WARP_SYNC(0xffffffff);   // full-mask warp sync between the writes above and the reads below
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);   // blocked slot within the same segment
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing: warps take turns using the start of the buffer, one warp per slice.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void WarpStripedToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, in <em>warp-striped</em> arrangement.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>blocked</em> arrangement.
-        Int2Type<true>  /*time_slicing*/)
-    {
-        #pragma unroll
-        for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
-        {
-            CTA_SYNC();   // reached by every thread each iteration, before the next warp reuses the buffer
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;   // no warp_offset: each slice starts at buffer offset 0
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage.buff[item_offset] = input_items[ITEM];
-                }
-
-                WARP_SYNC(0xffffffff);
-
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    output_items[ITEM] = temp_storage.buff[item_offset];
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for no timeslicing: each item is written to the buffer slot named by its rank, then read back in blocked order.
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>blocked</em> arrangement.
-        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = ranks[ITEM];   // rank is used directly as the buffer index; no range check is made here
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);   // presumably (item_offset >> LOG_SMEM_BANKS) + item_offset - confirm against SHR_ADD
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        CTA_SYNC();   // separates the scattered writes above from the reads below
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;   // blocked slot: thread-major
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-    /**
-     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for warp-timeslicing: per slice, items whose rank lies in the slice are deposited and the matching warp reads its blocked items back.
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>blocked</em> arrangement.
-        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<true>  /*time_slicing*/)
-    {
-        InputT temp_items[ITEMS_PER_THREAD];   // results are gathered here and copied to output_items only after all slices
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            CTA_SYNC();   // reads of the previous slice precede this; the buffer is overwritten next
-
-            const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;   // first rank covered by this slice
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = ranks[ITEM] - SLICE_OFFSET;   // rank relative to the slice
-                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))   // deposit only ranks that fall inside this slice
-                {
-                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);   // presumably (item_offset >> LOG_SMEM_BANKS) + item_offset - confirm against SHR_ADD
-                    temp_storage.buff[item_offset] = input_items[ITEM];
-                }
-            }
-
-            CTA_SYNC();   // separates the slice writes above from the reads below
-
-            if (warp_id == SLICE)   // only the warp matching this slice collects its blocked items
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
-                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_items[ITEM] = temp_storage.buff[item_offset];
-                }
-            }
-        }
-
-        // Copy the gathered items to the output array
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            output_items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-    /**
-     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for no timeslicing: each item is written to the buffer slot named by its rank, then read back in striped order.
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>striped</em> arrangement.
-        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = ranks[ITEM];   // rank is used directly as the buffer index; no range check is made here
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);   // presumably (item_offset >> LOG_SMEM_BANKS) + item_offset - confirm against SHR_ADD
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        CTA_SYNC();   // separates the scattered writes above from the reads below
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;   // striped slot: item-major
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-
-    /**
-     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for warp-timeslicing: per slice, items whose rank lies in the slice are deposited and every thread picks out the striped items that fall inside that slice.
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Exchanged items, in <em>striped</em> arrangement.
-        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<true> /*time_slicing*/)
-    {
-        InputT temp_items[ITEMS_PER_THREAD];   // results are gathered here and copied to output_items only after all slices
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;   // first rank / tile position covered by this slice
-            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;   // one past the last position covered
-
-            CTA_SYNC();   // reads of the previous slice precede this; the buffer is overwritten next
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = ranks[ITEM] - SLICE_OFFSET;   // rank relative to the slice
-                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))   // deposit only ranks that fall inside this slice
-                {
-                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);   // presumably (item_offset >> LOG_SMEM_BANKS) + item_offset - confirm against SHR_ADD
-                    temp_storage.buff[item_offset] = input_items[ITEM];
-                }
-            }
-
-            CTA_SYNC();   // separates the slice writes above from the reads below
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                // Read a strip of items
-                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
-                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
-
-                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))   // strip range intersects slice range
-                {
-                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;   // this thread's striped position relative to the slice
-                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
-                    {
-                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_items[ITEM] = temp_storage.buff[item_offset];
-                    }
-                }
-            }
-        }
-
-        // Copy the gathered items to the output array
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            output_items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-public:
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.  Binds temp_storage to PrivateStorage() and derives linear_tid, lane_id, warp_id and warp_offset for the calling thread.
-     */
-    __device__ __forceinline__ BlockExchange()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        lane_id(LaneId()),                                          // initializers follow member-declaration order, which is the order C++ runs them in
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),      // reads linear_tid, initialized above
-        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)               // reads warp_id, initialized above
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.  Binds temp_storage to temp_storage.Alias() and derives linear_tid, lane_id, warp_id and warp_offset for the calling thread.
-     */
-    __device__ __forceinline__ BlockExchange(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        lane_id(LaneId()),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),      // reads linear_tid, initialized above
-        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)               // reads warp_id, initialized above
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Structured exchanges
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Load a tile of ordered data into a striped arrangement across block threads
-     *     int thread_data[4];
-     *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
-     *
-     *     // Collectively exchange data into a blocked arrangement across threads
-     *     BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of striped input \p thread_data across the block of threads is
-     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> after loading from device-accessible memory.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void StripedToBlocked(
-        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, in <em>striped</em> arrangement.
-        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, in <em>blocked</em> arrangement.
-    {
-        StripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());   // tag-dispatch to the private overload selected by WARP_TIME_SLICING
-    }
-
-
-    /**
-     * \brief Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively exchange data into a striped arrangement across threads
-     *     BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
-     *
-     *     // Store data striped across block threads into an ordered tile
-     *     cub::StoreDirectStriped<128>(threadIdx.x, d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of blocked input \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> in
-     * preparation for storing to device-accessible memory.
-     *
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToStriped(
-        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, in <em>blocked</em> arrangement.
-        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, in <em>striped</em> arrangement.
-    {
-        BlockedToStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());   // tag-dispatch to the private overload selected by WARP_TIME_SLICING
-    }
-
-
-
-    /**
-     * \brief Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Load a tile of ordered data into a warp-striped arrangement across warp threads
-     *     int thread_data[4];
-     *     cub::LoadDirectWarpStriped(threadIdx.x, d_data, thread_data);
-     *
-     *     // Collectively exchange data into a blocked arrangement across threads
-     *     BlockExchange(temp_storage).WarpStripedToBlocked(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of warp-striped input \p thread_data across the block of threads is
-     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
-     * after loading from device-accessible memory.  (The first 128 items are striped across
-     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void WarpStripedToBlocked(
-        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, in <em>warp-striped</em> arrangement.
-        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, in <em>blocked</em> arrangement.
-    {
-        WarpStripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());   // tag-dispatch to the private overload selected by WARP_TIME_SLICING
-    }
-
-
-
-    /**
-     * \brief Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively exchange data into a warp-striped arrangement across threads
-     *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
-     *
-     *     // Store data striped across warp threads into an ordered tile
-     *     cub::StoreDirectWarpStriped(threadIdx.x, d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of blocked input \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
-     * in preparation for storing to device-accessible memory. (The first 128 items are striped across
-     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
-     *
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToWarpStriped(
-        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        // Forward to the three-argument overload, passing a default-constructed
-        // Int2Type<WARP_TIME_SLICING> tag object as the extra argument.
-        BlockedToWarpStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Scatter exchanges
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Exchanges data items annotated by rank into <em>blocked</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToBlocked(
-        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
-    {
-        // Forward to the four-argument overload, passing a default-constructed
-        // Int2Type<WARP_TIME_SLICING> tag object as the extra argument.
-        ScatterToBlocked(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-
-    /**
-     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToStriped(
-        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
-    {
-        // Forward to the four-argument overload, passing a default-constructed
-        // Int2Type<WARP_TIME_SLICING> tag object as the extra argument.
-        ScatterToStriped(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-
-    /**
-     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.  Items with rank -1 are not exchanged.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToStripedGuarded(
-        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to scatter; item i is stored at shared-buffer position ranks[i] unless that rank is negative.
-        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items read back from the shared buffer at positions (i * BLOCK_THREADS) + linear_tid.
-        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Scatter rank of each input item; a negative rank suppresses the store.
-    {
-        // Phase 1: deposit each item into the shared buffer at the (optionally
-        // padded) position named by its rank.
-        #pragma unroll
-        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-        {
-            int slot = ranks[i];
-            if (INSERT_PADDING)
-            {
-                slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot);
-            }
-
-            // Only items carrying a non-negative rank are written.
-            if (ranks[i] >= 0)
-            {
-                temp_storage.buff[slot] = input_items[i];
-            }
-        }
-
-        // CTA_SYNC() separates the store phase from the load phase.
-        CTA_SYNC();
-
-        // Phase 2: read back item i from position (i * BLOCK_THREADS) + linear_tid,
-        // applying the same padding transform as the store phase.
-        #pragma unroll
-        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-        {
-            int slot = int(i * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING)
-            {
-                slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot);
-            }
-            output_items[i] = temp_storage.buff[slot];
-        }
-    }
-
-
-
-
-    /**
-     * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
-     * \tparam ValidFlag                            <b>[inferred]</b> FlagT type denoting which items are valid
-     */
-    template <typename OutputT, typename OffsetT, typename ValidFlag>
-    __device__ __forceinline__ void ScatterToStripedFlagged(
-        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to scatter; item i is stored at shared-buffer position ranks[i] when is_valid[i] is set.
-        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items read back from the shared buffer at positions (i * BLOCK_THREADS) + linear_tid.
-        OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Scatter rank of each input item
-        ValidFlag   is_valid[ITEMS_PER_THREAD])         ///< [in] Per-item flag; the store for item i is skipped when is_valid[i] evaluates to false.
-    {
-        // Phase 1: deposit each flagged item into the shared buffer at the
-        // (optionally padded) position named by its rank.
-        #pragma unroll
-        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-        {
-            int slot = ranks[i];
-            if (INSERT_PADDING)
-            {
-                slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot);
-            }
-
-            // Unflagged items are never written.
-            if (is_valid[i])
-            {
-                temp_storage.buff[slot] = input_items[i];
-            }
-        }
-
-        // CTA_SYNC() separates the store phase from the load phase.
-        CTA_SYNC();
-
-        // Phase 2: read back item i from position (i * BLOCK_THREADS) + linear_tid,
-        // applying the same padding transform as the store phase.
-        #pragma unroll
-        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-        {
-            int slot = int(i * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING)
-            {
-                slot = SHR_ADD(slot, LOG_SMEM_BANKS, slot);
-            }
-            output_items[i] = temp_storage.buff[slot];
-        }
-    }
-
-
-    //@}  end member group
-
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-    __device__ __forceinline__ void StripedToBlocked(
-        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        // In-place form: the same array is passed as both input and output.
-        StripedToBlocked(items, items);
-    }
-
-    __device__ __forceinline__ void BlockedToStriped(
-        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        // In-place form: the same array is passed as both input and output.
-        BlockedToStriped(items, items);
-    }
-
-    __device__ __forceinline__ void WarpStripedToBlocked(
-        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        // In-place form: the same array is passed as both input and output.
-        WarpStripedToBlocked(items, items);
-    }
-
-    __device__ __forceinline__ void BlockedToWarpStriped(
-        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        // In-place form: the same array is passed as both input and output.
-        BlockedToWarpStriped(items, items);
-    }
-
-    template <typename OffsetT>
-    __device__ __forceinline__ void ScatterToBlocked(
-        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
-    {
-        // In-place form: the same array is passed as both input and output.
-        ScatterToBlocked(items, items, ranks);
-    }
-
-    template <typename OffsetT>
-    __device__ __forceinline__ void ScatterToStriped(
-        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
-    {
-        // In-place form: the same array is passed as both input and output.
-        ScatterToStriped(items, items, ranks);
-    }
-
-    template <typename OffsetT>
-    __device__ __forceinline__ void ScatterToStripedGuarded(
-        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
-    {
-        // In-place form: the same array is passed as both input and output.
-        ScatterToStripedGuarded(items, items, ranks);
-    }
-
-    /// In-place form of ScatterToStripedFlagged: \p items serves as both the source and the destination array.
-    template <typename OffsetT, typename ValidFlag>
-    __device__ __forceinline__ void ScatterToStripedFlagged(
-        InputT      items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD],        ///< [in] Corresponding scatter ranks
-        ValidFlag   is_valid[ITEMS_PER_THREAD])     ///< [in] Corresponding flag denoting item validity
-    {
-        // Forward to the out-of-place ScatterToStripedFlagged so that the
-        // is_valid flags are honored.  (ScatterToStriped does not accept a
-        // validity-flag array as its fourth argument.)
-        ScatterToStripedFlagged(items, items, ranks, is_valid);
-    }
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-};
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/**
- * Warp-level exchange helper: scatters per-thread items through a shared
- * buffer and reads them back at striped positions
- * ((ITEM * LOGICAL_WARP_THREADS) + lane_id).
- *
- * T                     Item type held in the shared buffer
- * ITEMS_PER_THREAD      Number of items owned by each thread
- * LOGICAL_WARP_THREADS  Number of threads in the logical warp (default: CUB_PTX_WARP_THREADS)
- * PTX_ARCH              Architecture value used to derive warp size and shared-memory bank count (default: CUB_PTX_ARCH)
- */
-template <
-    typename    T,
-    int         ITEMS_PER_THREAD,
-    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
-    int         PTX_ARCH                = CUB_PTX_ARCH>
-class WarpExchange
-{
-private:
-
-    /******************************************************************************
-     * Constants
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        // Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        // Number of unpadded buffer elements: one per item in the logical warp, plus one extra element
-        WARP_ITEMS                  = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,
-
-        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
-        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
-
-        // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
-        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
-
-        // Extra buffer elements reserved for padding: one per SMEM_BANKS elements when padding is enabled
-        PADDING_ITEMS               = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        T buff[WARP_ITEMS + PADDING_ITEMS];
-    };
-
-public:
-
-    /// \smemstorage{WarpExchange}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-private:
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Reference to the exchange buffer supplied at construction
-    _TempStorage    &temp_storage;
-
-    /// Calling thread's lane index within its logical warp
-    int             lane_id;
-
-public:
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpExchange(
-        TempStorage &temp_storage)              ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        // When the logical warp is narrower than the architectural warp, fold
-        // the hardware lane index into the logical warp's range.
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS)
-    {}
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    /**
-     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     * - The \p ranks array is read only; it is not modified by this call, so
-     *   the same rank array can be reused for a subsequent exchange.
-     *
-     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename OffsetT>
-    __device__ __forceinline__ void ScatterToStriped(
-        T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
-        OffsetT         ranks[ITEMS_PER_THREAD])        ///< [in] Corresponding scatter ranks
-    {
-        // Store phase: write each item at the (optionally padded) position
-        // named by its rank.  The padded position is kept in a local so the
-        // caller's rank array is left untouched.
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = ranks[ITEM];
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            temp_storage.buff[item_offset] = items[ITEM];
-        }
-
-        // WARP_SYNC separates the store phase from the load phase.
-        WARP_SYNC(0xffffffff);
-
-        // Load phase: read item ITEM from position (ITEM * LOGICAL_WARP_THREADS) + lane_id,
-        // applying the same padding transform as the store phase.
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-};
-
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/block_histogram.cuh b/external/cub/cub/block/block_histogram.cuh
deleted file mode 100644
index 5d393c2353f..00000000000
--- a/external/cub/cub/block/block_histogram.cuh
+++ /dev/null
@@ -1,415 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "specializations/block_histogram_sort.cuh"
-#include "specializations/block_histogram_atomic.cuh"
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms.
- */
-enum BlockHistogramAlgorithm
-{
-
-    /**
-     * \par Overview
-     * Sorting followed by differentiation.  Execution consists of two phases:
-     * -# Sort the data using efficient radix sort
-     * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
-     *
-     * \par Performance Considerations
-     * Delivers consistent throughput regardless of sample bin distribution.
-     */
-    BLOCK_HISTO_SORT,
-
-
-    /**
-     * \par Overview
-     * Use atomic addition to update byte counts directly.
-     *
-     * \par Performance Considerations
-     * Performance is strongly tied to the hardware implementation of atomic
-     * addition, and may be significantly degraded for non uniformly-random
-     * input distributions where many concurrent updates are likely to be
-     * made to the same bin counter.
-     *
-     * \par
-     * cub::BlockHistogram substitutes BLOCK_HISTO_SORT for this value when
-     * its PTX_ARCH template parameter is below 120.
-     */
-    BLOCK_HISTO_ATOMIC,
-};
-
-
-
-/******************************************************************************
- * Block histogram
- ******************************************************************************/
-
-
-/**
- * \brief The BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                    The sample type being histogrammed (must be castable to an integer bin identifier)
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of items per thread
- * \tparam BINS                 The number of bins within the histogram
- * \tparam ALGORITHM            <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
- *   counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
- * - BlockHistogram can be optionally specialized to use different algorithms:
- *   -# <b>cub::BLOCK_HISTO_SORT</b>.  Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm)
- *   -# <b>cub::BLOCK_HISTO_ATOMIC</b>.  Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par A Simple Example
- * \blockcollective{BlockHistogram}
- * \par
- * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
- * are partitioned across 128 threads where each thread owns 4 samples.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
- *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
- *
- *     // Allocate shared memory for BlockHistogram
- *     __shared__ typename BlockHistogram::TempStorage temp_storage;
- *
- *     // Allocate shared memory for block-wide histogram bin counts
- *     __shared__ unsigned int smem_histogram[256];
- *
- *     // Obtain input samples per thread
- *     unsigned char data[4];
- *     ...
- *
- *     // Compute the block-wide histogram
- *     BlockHistogram(temp_storage).Histogram(data, smem_histogram);
- *
- * \endcode
- *
- * \par Performance and Usage Considerations
- * - The histogram output can be constructed in shared or device-accessible memory
- * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
- *
- */
-template <
-    typename                T,
-    int                     BLOCK_DIM_X,
-    int                     ITEMS_PER_THREAD,
-    int                     BINS,
-    BlockHistogramAlgorithm ALGORITHM           = BLOCK_HISTO_SORT,
-    int                     BLOCK_DIM_Y         = 1,
-    int                     BLOCK_DIM_Z         = 1,
-    int                     PTX_ARCH            = CUB_PTX_ARCH>
-class BlockHistogram
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /**
-     * Ensure the template parameterization meets the requirements of the
-     * targeted device architecture.  BLOCK_HISTO_ATOMIC is honored only
-     * when PTX_ARCH is at least 120; for lower PTX_ARCH values
-     * BLOCK_HISTO_SORT is used regardless of the requested ALGORITHM.
-     */
-    static const BlockHistogramAlgorithm SAFE_ALGORITHM =
-        ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ?
-            BLOCK_HISTO_SORT :
-            ALGORITHM;
-
-    /// Internal specialization.
-    typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT),
-        BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>,
-        BlockHistogramAtomic<BINS> >::Type InternalBlockHistogram;
-
-    /// Shared memory storage layout type for BlockHistogram
-    typedef typename InternalBlockHistogram::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        // Function-scope __shared__ object; a reference to it is handed back
-        // to the caller (used by the default constructor).
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-public:
-
-    /// \smemstorage{BlockHistogram}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockHistogram()
-    :
-        // Bind to the __shared__ object declared inside PrivateStorage()
-        temp_storage(PrivateStorage()),
-        // Thread index computed by RowMajorTid from the block's three dimensions
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockHistogram(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        // The parameter shadows the member of the same name: Alias() is called
-        // on the caller-supplied TempStorage and its result initializes the member.
-        temp_storage(temp_storage.Alias()),
-        // Thread index computed by RowMajorTid from the block's three dimensions
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Histogram operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Initialize the shared histogram counters to zero.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the initialization and update of a
-     * histogram of 512 integer samples that are partitioned across 128 threads
-     * where each thread owns 4 samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
-     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     *     // Allocate shared memory for BlockHistogram
-     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
-     *
-     *     // Allocate shared memory for block-wide histogram bin counts
-     *     __shared__ unsigned int smem_histogram[256];
-     *
-     *     // Obtain input samples per thread
-     *     unsigned char thread_samples[4];
-     *     ...
-     *
-     *     // Initialize the block-wide histogram
-     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
-     *
-     *     // Update the block-wide histogram
-     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
-     *
-     * \endcode
-     *
-     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
-     */
-    template <typename CounterT     >
-    __device__ __forceinline__ void InitHistogram(CounterT      histogram[BINS])
-    {
-        // Bins covered by whole, unguarded passes of the block: the largest
-        // multiple of BLOCK_THREADS that does not exceed BINS.
-        const int full_pass_bins = (BINS / BLOCK_THREADS) * BLOCK_THREADS;
-
-        // Unguarded passes: every thread zeroes one bin per pass.
-        #pragma unroll
-        for (int base = 0; base < full_pass_bins; base += BLOCK_THREADS)
-        {
-            histogram[base + linear_tid] = 0;
-        }
-
-        // Guarded tail pass: needed only when BINS is not a multiple of
-        // BLOCK_THREADS, and only for threads whose bin index is in range.
-        if ((BINS % BLOCK_THREADS != 0) && (full_pass_bins + linear_tid < BINS))
-        {
-            histogram[full_pass_bins + linear_tid] = 0;
-        }
-    }
-
-
-    /**
-     * \brief Constructs a block-wide histogram in shared/device-accessible memory.  Each thread contributes an array of input elements.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
-     * are partitioned across 128 threads where each thread owns 4 samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
-     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     *     // Allocate shared memory for BlockHistogram
-     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
-     *
-     *     // Allocate shared memory for block-wide histogram bin counts
-     *     __shared__ unsigned int smem_histogram[256];
-     *
-     *     // Obtain input samples per thread
-     *     unsigned char thread_samples[4];
-     *     ...
-     *
-     *     // Compute the block-wide histogram
-     *     BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
-     *
-     * \endcode
-     *
-     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
-     */
-    template <typename CounterT>
-    __device__ __forceinline__ void Histogram(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        CounterT             histogram[BINS])                ///< [out] Reference to shared/device-accessible memory histogram
-    {
-        // Step 1: zero every bin counter
-        InitHistogram(histogram);
-
-        // Step 2: CTA_SYNC() between zeroing the counters and updating them
-        CTA_SYNC();
-
-        // Step 3: fold this thread's samples into the histogram
-        Composite(items, histogram);
-    }
-
-
-
-    /**
-     * \brief Updates an existing block-wide histogram in shared/device-accessible memory.  Each thread composites an array of input elements.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the initialization and update of a
-     * histogram of 512 integer samples that are partitioned across 128 threads
-     * where each thread owns 4 samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
-     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     *     // Allocate shared memory for BlockHistogram
-     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
-     *
-     *     // Allocate shared memory for block-wide histogram bin counts
-     *     __shared__ unsigned int smem_histogram[256];
-     *
-     *     // Obtain input samples per thread
-     *     unsigned char thread_samples[4];
-     *     ...
-     *
-     *     // Initialize the block-wide histogram
-     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
-     *
-     *     // Update the block-wide histogram
-     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
-     *
-     * \endcode
-     *
-     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
-     */
-    template <typename CounterT>
-    __device__ __forceinline__ void Composite(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        CounterT             histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
-    {
-        // Build the algorithm-specific implementation over this instance's
-        // temporary storage and let it update the histogram.
-        InternalBlockHistogram internal_histogram(temp_storage);
-        internal_histogram.Composite(items, histogram);
-    }
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/block_load.cuh b/external/cub/cub/block/block_load.cuh
deleted file mode 100644
index 234dad295a0..00000000000
--- a/external/cub/cub/block/block_load.cuh
+++ /dev/null
@@ -1,1268 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Operations for reading linear tiles of data into the CUDA thread block.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_exchange.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_ptx.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-
-/******************************************************************//**
- * \name Blocked arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block.
- *
- * \blocked
- *
- * Thread \p linear_tid reads the \p ITEMS_PER_THREAD consecutive items that begin at
- * offset <tt>linear_tid * ITEMS_PER_THREAD</tt> from \p block_itr.  No range check is performed.
- *
- * \tparam InputT               <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectBlocked(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    // Tile-relative offset of the first item owned by this thread
-    const int first_item = linear_tid * ITEMS_PER_THREAD;
-    InputIteratorT src = block_itr + first_item;
-
-    // Copy this thread's consecutive run of items
-    #pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-        items[i] = src[i];
-}
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
- *
- * \blocked
- *
- * Only items whose tile-relative index is below \p valid_items are read; the
- * remaining entries of \p items are left unmodified.
- *
- * \tparam InputT               <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectBlocked(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items)                ///< [in] Number of valid items to load
-{
-    // Tile-relative offset of the first item owned by this thread
-    const int first_item = linear_tid * ITEMS_PER_THREAD;
-    InputIteratorT src = block_itr + first_item;
-
-    #pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-    {
-        // Skip the read for any item that lies past the valid range
-        const bool in_range = (first_item + i) < valid_items;
-        if (in_range)
-            items[i] = src[i];
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
- *
- * \blocked
- *
- * Every entry of \p items is first set to \p oob_default; the range-guarded
- * overload of LoadDirectBlocked then overwrites the in-range entries.
- *
- * \tparam InputT               <b>[inferred]</b> The data type to load.
- * \tparam DefaultT             <b>[inferred]</b> The type of the out-of-bound default (assigned to \p InputT items).
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    typename        DefaultT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectBlocked(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items,                ///< [in] Number of valid items to load
-    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
-{
-    // Pre-fill every slot with the fall-back value
-    #pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-    {
-        items[i] = oob_default;
-    }
-
-    // The guarded load replaces only the slots that are in range
-    LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
-}
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Internal implementation for load vectorization.
- *
- * Reinterprets \p block_ptr as an array of vectors of DeviceWord, loads this thread's
- * vectors through ThreadLoad<MODIFIER>, and then copies them into \p items as \p T values.
- *
- * NOTE(review): the pointer reinterpretation presumably requires \p block_ptr to be
- * aligned for the chosen vector type; nothing in this function checks it -- confirm at call sites.
- */
-template <
-    CacheLoadModifier   MODIFIER,
-    typename            T,
-    int                 ITEMS_PER_THREAD>
-__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(
-    int    linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
-    T      *block_ptr,                 ///< [in] Input pointer for loading from
-    T      (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    // Biggest memory access word that T is a whole multiple of
-    typedef typename UnitWord<T>::DeviceWord DeviceWord;
-
-    enum
-    {
-        // Number of DeviceWords occupied by this thread's whole items array
-        TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord),
-
-        // Widest of 4, 2 or 1 words that evenly divides TOTAL_WORDS
-        VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ?
-            4 :
-            (TOTAL_WORDS % 2 == 0) ?
-                2 :
-                1,
-
-        // Number of vector loads each thread issues
-        VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE,
-    };
-
-    // Vector type
-    typedef typename CubVector<DeviceWord, VECTOR_SIZE>::Type Vector;
-
-    // Vector items (thread-local staging buffer for the loaded vectors)
-    Vector vec_items[VECTORS_PER_THREAD];
-
-    // Aliased input ptr, advanced to the first vector owned by this thread
-    Vector* vec_ptr = reinterpret_cast<Vector*>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);
-
-    // Load directly in thread-blocked order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
-    {
-        vec_items[ITEM] = ThreadLoad<MODIFIER>(vec_ptr + ITEM);
-    }
-
-    // Copy: reinterpret the staging buffer as T and move each item to the output array
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = *(reinterpret_cast<T*>(vec_items) + ITEM);
-    }
-}
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block.
- *
- * \blocked
- *
- * The input pointer \p block_ptr must be quad-item aligned.
- *
- * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
- *   - \p ITEMS_PER_THREAD is odd
- *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- */
-template <
-    typename        T,
-    int             ITEMS_PER_THREAD>
-__device__ __forceinline__ void LoadDirectBlockedVectorized(
-    int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
-    T   *block_ptr,                 ///< [in] Input pointer for loading from
-    T   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    // Forward to the internal implementation using the LOAD_DEFAULT cache modifier
-    InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
-}
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Load a linear segment of items into a striped arrangement across the thread block.
- *
- * \striped
- *
- * Thread \p linear_tid reads items <tt>linear_tid + k * BLOCK_THREADS</tt> for
- * <tt>k = 0 .. ITEMS_PER_THREAD-1</tt>.  No range check is performed.
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam InputT               <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    int             BLOCK_THREADS,
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    // Position of this thread's item within the first stripe
-    InputIteratorT src = block_itr + linear_tid;
-
-    #pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-    {
-        // Successive items of one thread are BLOCK_THREADS apart
-        const int stripe_offset = i * BLOCK_THREADS;
-        items[i] = src[stripe_offset];
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range.
- *
- * \striped
- *
- * Only items whose tile-relative index is below \p valid_items are read; the
- * remaining entries of \p items are left unmodified.
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam InputT               <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    int             BLOCK_THREADS,
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items)                ///< [in] Number of valid items to load
-{
-    // Position of this thread's item within the first stripe
-    InputIteratorT src = block_itr + linear_tid;
-
-    #pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-    {
-        // Successive items of one thread are BLOCK_THREADS apart
-        const int stripe_offset = i * BLOCK_THREADS;
-        if (linear_tid + stripe_offset < valid_items)
-            items[i] = src[stripe_offset];
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
- *
- * \striped
- *
- * Every entry of \p items is first set to \p oob_default; the range-guarded
- * overload of LoadDirectStriped then overwrites the in-range entries.
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam InputT               <b>[inferred]</b> The data type to load.
- * \tparam DefaultT             <b>[inferred]</b> The type of the out-of-bound default (assigned to \p InputT items).
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    int             BLOCK_THREADS,
-    typename        InputT,
-    typename        DefaultT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items,                ///< [in] Number of valid items to load
-    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
-{
-    // Pre-fill every slot with the fall-back value
-    #pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-    {
-        items[i] = oob_default;
-    }
-
-    // The guarded load replaces only the slots that are in range
-    LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
-}
-
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Warp-striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Load a linear segment of items into a warp-striped arrangement across the thread block.
- *
- * \warpstriped
- *
- * Each warp covers a contiguous run of <tt>CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD</tt> items;
- * within that run a thread reads items that are \p CUB_PTX_WARP_THREADS apart, starting
- * at its lane index.  No range check is performed.
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam InputT               <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectWarpStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    // Split the linear thread id into (warp, lane)
-    const int lane      = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    const int warp      = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-
-    // Tile-relative offset of the first item covered by this warp
-    const int warp_base = warp * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-
-    InputIteratorT src = block_itr + warp_base + lane;
-
-    // Load directly in warp-striped order
-    #pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-        items[i] = src[i * CUB_PTX_WARP_THREADS];
-}
-
-
-/**
- * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range.
- *
- * \warpstriped
- *
- * Only items whose tile-relative index is below \p valid_items are read; the
- * remaining entries of \p items are left unmodified.
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam InputT               <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectWarpStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items)                ///< [in] Number of valid items to load
-{
-    // Split the linear thread id into (warp, lane)
-    const int lane        = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    const int warp        = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-
-    // Tile-relative offset of the first item covered by this warp, then by this thread
-    const int warp_base   = warp * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-    const int thread_base = warp_base + lane;
-
-    InputIteratorT src = block_itr + warp_base + lane;
-
-    // Load directly in warp-striped order, skipping items past the valid range
-    #pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-    {
-        const int stripe_offset = i * CUB_PTX_WARP_THREADS;
-        if (thread_base + stripe_offset < valid_items)
-            items[i] = src[stripe_offset];
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
- *
- * \warpstriped
- *
- * Every entry of \p items is first set to \p oob_default; the range-guarded
- * overload of LoadDirectWarpStriped then overwrites the in-range entries.
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam InputT               <b>[inferred]</b> The data type to load.
- * \tparam DefaultT             <b>[inferred]</b> The type of the out-of-bound default (assigned to \p InputT items).
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    typename        DefaultT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectWarpStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + threadIdx.x</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items,                ///< [in] Number of valid items to load
-    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
-{
-    // Pre-fill every slot with the fall-back value
-    #pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-    {
-        items[i] = oob_default;
-    }
-
-    // The guarded load replaces only the slots that are in range
-    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
-}
-
-
-
-//@}  end member group
-
-/** @} */       // end group UtilIo
-
-
-
-//-----------------------------------------------------------------------------
-// Generic BlockLoad abstraction
-//-----------------------------------------------------------------------------
-
-/**
- * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
- */
-enum BlockLoadAlgorithm
-{
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
-     * directly from memory.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) decreases as the
-     *   access stride between threads increases (i.e., the number of items per thread).
-     */
-    BLOCK_LOAD_DIRECT,
-
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
-     * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
-     * For example, <tt>ld.global.v4.s32</tt> instructions will be generated
-     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high until the
-     *   access stride between threads (i.e., the number of items per thread) exceeds the
-     *   maximum vector load width (typically 4 items or 64B, whichever is lower).
-     * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
-     *   - \p ITEMS_PER_THREAD is odd
-     *   - The \p InputIteratorT is not a simple pointer type
-     *   - The block input offset is not quadword-aligned
-     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
-     */
-    BLOCK_LOAD_VECTORIZE,
-
-    /**
-     * \par Overview
-     *
-     * A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
-     * efficiently from memory and then locally transposed into a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3).
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items loaded per thread.
-     * - The local reordering incurs slightly longer latencies and throughput than the
-     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
-     */
-    BLOCK_LOAD_TRANSPOSE,
-
-
-    /**
-     * \par Overview
-     *
-     * A [<em>warp-striped arrangement</em>](index.html#sec5sec3) of data is
-     * read efficiently from memory and then locally transposed into a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3).
-     *
-     * \par Usage Considerations
-     * - BLOCK_THREADS must be a multiple of WARP_THREADS
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items loaded per thread.
-     * - The local reordering incurs slightly larger latencies than the
-     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
-     * - Provisions more shared storage, but incurs smaller latencies than the
-     *   BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
-     */
-    BLOCK_LOAD_WARP_TRANSPOSE,
-
-
-    /**
-     * \par Overview
-     *
-     * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [<em>warp-striped arrangement</em>](index.html#sec5sec3)
-     * of data is read directly from memory and then is locally transposed into a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3). To reduce the shared memory
-     * requirement, only one warp's worth of shared memory is provisioned and is
-     * subsequently time-sliced among warps.
-     *
-     * \par Usage Considerations
-     * - BLOCK_THREADS must be a multiple of WARP_THREADS
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items loaded per thread.
-     * - Provisions less shared memory temporary storage, but incurs larger
-     *   latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative.
-     */
-    BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
-};
-
-
-/**
- * \brief The BlockLoad class provides [<em>collective</em>](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [<em>blocked arrangement</em>](index.html#sec5sec3) across a CUDA thread block.  ![](block_load_logo.png)
- * \ingroup BlockModule
- * \ingroup UtilIo
- *
- * \tparam InputT               The data type to read into (which must be convertible from the input iterator's value type).
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
- * \tparam ALGORITHM            <b>[optional]</b> cub::BlockLoadAlgorithm tuning policy.  default: cub::BLOCK_LOAD_DIRECT.
- * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - The BlockLoad class provides a single data movement abstraction that can be specialized
- *   to implement different cub::BlockLoadAlgorithm strategies.  This facilitates different
- *   performance policies for different architectures, data types, granularity sizes, etc.
- * - BlockLoad can be optionally specialized by different data movement strategies:
- *   -# <b>cub::BLOCK_LOAD_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory.  [More...](\ref cub::BlockLoadAlgorithm)
- *   -# <b>cub::BLOCK_LOAD_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory using CUDA's built-in vectorized loads as a
- *      coalescing optimization.    [More...](\ref cub::BlockLoadAlgorithm)
- *   -# <b>cub::BLOCK_LOAD_TRANSPOSE</b>.  A [<em>striped arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory and is then locally transposed into a
- *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
- *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory and is then locally transposed into a
- *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
- *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory and is then locally transposed into a
- *      [<em>blocked arrangement</em>](index.html#sec5sec3) one warp at a time.  [More...](\ref cub::BlockLoadAlgorithm)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockLoad}
- * \par
- * The code snippet below illustrates the loading of a linear
- * segment of 512 integers into a "blocked" arrangement across 128 threads where each
- * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
- * meaning memory references are efficiently coalesced using a warp-striped access
- * pattern (after which items are locally reordered among threads).
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
- *
- *     // Allocate shared memory for BlockLoad
- *     __shared__ typename BlockLoad::TempStorage temp_storage;
- *
- *     // Load a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     BlockLoad(temp_storage).Load(d_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
- * The set of \p thread_data across the block of threads will be
- * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
- *
- */
-template <
-    typename            InputT,
-    int                 BLOCK_DIM_X,
-    int                 ITEMS_PER_THREAD,
-    BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
-    int                 BLOCK_DIM_Y         = 1,
-    int                 BLOCK_DIM_Z         = 1,
-    int                 PTX_ARCH            = CUB_PTX_ARCH>
-class BlockLoad
-{
-private:
-
-    /******************************************************************************
-     * Constants and typed definitions
-     ******************************************************************************/
-
-    /// Compile-time constants
-    enum
-    {
-        /// The thread block size in threads (product of the X, Y and Z block dimensions)
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-
-    /******************************************************************************
-     * Algorithmic variants
-     ******************************************************************************/
-
-    /// Load helper, declared here and defined only through per-algorithm specializations.
-    /// Each specialization fixes _POLICY and leaves DUMMY open, i.e. it is a partial
-    /// specialization (presumably because a full explicit specialization is not
-    /// permitted at class scope -- confirm against the targeted C++ standard).
-    template <BlockLoadAlgorithm _POLICY, int DUMMY>
-    struct LoadInternal;
-
-
-    /**
-     * BLOCK_LOAD_DIRECT specialization of load helper.
-     *
-     * Every Load overload forwards to the matching LoadDirectBlocked overload
-     * using the linear thread-id captured at construction.
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
-    {
-        /// Shared memory storage layout type (NullType: this variant declares no storage of its own)
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor (the temp_storage argument is accepted but not used)
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &/*temp_storage*/,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-        template <typename InputIteratorT, typename DefaultT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
-        }
-
-    };
-
-
-    /**
-     * BLOCK_LOAD_VECTORIZE specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &/*temp_storage*/,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputT               *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
-            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
-        {
-            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
-        }
-
-        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            const InputT         *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
-            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
-        {
-            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
-        }
-
-        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
-        template <
-            CacheLoadModifier   MODIFIER,
-            typename            ValueType,
-            typename            OffsetT>
-        __device__ __forceinline__ void Load(
-            CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT>    block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT                                                     (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
-        {
-            InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_itr.ptr, items);
-        }
-
-        /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
-        template <typename _InputIteratorT>
-        __device__ __forceinline__ void Load(
-            _InputIteratorT   block_itr,                    ///< [in] The thread block's base input iterator for loading from
-            InputT           (&items)[ITEMS_PER_THREAD])   ///< [out] Data to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range (skips vectorization)
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
-        template <typename InputIteratorT, typename DefaultT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            DefaultT          oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
-        }
-
-    };
-
-
-    /**
-     * BLOCK_LOAD_TRANSPOSE specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
-    {
-        // BlockExchange utility type for keys
-        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
-        {
-            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
-            BlockExchange(temp_storage).StripedToBlocked(items, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
-            BlockExchange(temp_storage).StripedToBlocked(items, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-        template <typename InputIteratorT, typename DefaultT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items, oob_default);
-            BlockExchange(temp_storage).StripedToBlocked(items, items);
-        }
-
-    };
-
-
-    /**
-     * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
-    {
-        enum
-        {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
-        };
-
-        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
-        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
-
-        // BlockExchange utility type for keys
-        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
-        {
-            LoadDirectWarpStriped(linear_tid, block_itr, items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-        template <typename InputIteratorT, typename DefaultT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-    };
-
-
-    /**
-     * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
-    {
-        enum
-        {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
-        };
-
-        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
-        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
-
-        // BlockExchange utility type for keys
-        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
-        {
-            LoadDirectWarpStriped(linear_tid, block_itr, items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-        template <typename InputIteratorT, typename DefaultT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            LoadDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items, oob_default);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-    };
-
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Internal load implementation to use
-    typedef LoadInternal<ALGORITHM, 0> InternalLoad;
-
-
-    /// Shared memory storage layout type
-    typedef typename InternalLoad::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Thread reference to shared storage
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-public:
-
-    /// \smemstorage{BlockLoad}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockLoad()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockLoad(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Data movement
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Load a linear segment of items from memory.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the loading of a linear
-     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
-     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
-     * meaning memory references are efficiently coalesced using a warp-striped access
-     * pattern (after which items are locally reordered among threads).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
-     *
-     *     // Allocate shared memory for BlockLoad
-     *     __shared__ typename BlockLoad::TempStorage temp_storage;
-     *
-     *     // Load a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     BlockLoad(temp_storage).Load(d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
-     * The set of \p thread_data across the block of threads in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    template <typename InputIteratorT>
-    __device__ __forceinline__ void Load(
-        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-    {
-        InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
-    }
-
-
-    /**
-     * \brief Load a linear segment of items from memory, guarded by range.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the guarded loading of a linear
-     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
-     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
-     * meaning memory references are efficiently coalesced using a warp-striped access
-     * pattern (after which items are locally reordered among threads).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
-     * {
-     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
-     *
-     *     // Allocate shared memory for BlockLoad
-     *     __shared__ typename BlockLoad::TempStorage temp_storage;
-     *
-     *     // Load a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt> and \p valid_items is \p 5.
-     * The set of \p thread_data across the block of threads in those threads will be
-     * <tt>{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }</tt>, with only the first two threads
-     * being unmasked to load portions of valid data (and other items remaining unassigned).
-     *
-     */
-    template <typename InputIteratorT>
-    __device__ __forceinline__ void Load(
-        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-        int             valid_items)                ///< [in] Number of valid items to load
-    {
-        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
-    }
-
-
-    /**
-     * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the guarded loading of a linear
-     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
-     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
-     * meaning memory references are efficiently coalesced using a warp-striped access
-     * pattern (after which items are locally reordered among threads).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
-     * {
-     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
-     *
-     *     // Allocate shared memory for BlockLoad
-     *     __shared__ typename BlockLoad::TempStorage temp_storage;
-     *
-     *     // Load a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt>,
-     * \p valid_items is \p 5, and the out-of-bounds default is \p -1.
-     * The set of \p thread_data across the block of threads in those threads will be
-     * <tt>{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }</tt>, with only the first two threads
-     * being unmasked to load portions of valid data (and other items are assigned \p -1)
-     *
-     */
-    template <typename InputIteratorT, typename DefaultT>
-    __device__ __forceinline__ void Load(
-        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-        int             valid_items,                ///< [in] Number of valid items to load
-        DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
-    {
-        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
-    }
-
-
-    //@}  end member group
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/block_radix_rank.cuh b/external/cub/cub/block/block_radix_rank.cuh
deleted file mode 100644
index 77500ba0ede..00000000000
--- a/external/cub/cub/block/block_radix_rank.cuh
+++ /dev/null
@@ -1,697 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
- */
-
-#pragma once
-
-#include <stdint.h>
-
-#include "../thread/thread_reduce.cuh"
-#include "../thread/thread_scan.cuh"
-#include "../block/block_scan.cuh"
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
- * \ingroup BlockModule
- *
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam RADIX_BITS           The number of radix bits per digit place
- * \tparam IS_DESCENDING           Whether or not the sorted-order is high-to-low
- * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
- * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
- * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * Blah...
- * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
- * - \blocked
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par Examples
- * \par
- * - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
- *      \code
- *      #include <cub/cub.cuh>
- *
- *      template <int BLOCK_THREADS>
- *      __global__ void ExampleKernel(...)
- *      {
- *
- *      \endcode
- */
-template <
-    int                     BLOCK_DIM_X,
-    int                     RADIX_BITS,
-    bool                    IS_DESCENDING,
-    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
-    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
-    int                     BLOCK_DIM_Y             = 1,
-    int                     BLOCK_DIM_Z             = 1,
-    int                     PTX_ARCH                = CUB_PTX_ARCH>
-class BlockRadixRank
-{
-private:
-
-    /******************************************************************************
-     * Type definitions and constants
-     ******************************************************************************/
-
-    // Integer type for digit counters (to be packed into words of type PackedCounters)
-    typedef unsigned short DigitCounter;
-
-    // Integer type for packing DigitCounters into columns of shared memory banks
-    typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
-        unsigned long long,
-        unsigned int>::Type PackedCounter;
-
-    enum
-    {
-        // The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        RADIX_DIGITS                = 1 << RADIX_BITS,
-
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
-        WARP_THREADS                = 1 << LOG_WARP_THREADS,
-        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        BYTES_PER_COUNTER           = sizeof(DigitCounter),
-        LOG_BYTES_PER_COUNTER       = Log2<BYTES_PER_COUNTER>::VALUE,
-
-        PACKING_RATIO               = sizeof(PackedCounter) / sizeof(DigitCounter),
-        LOG_PACKING_RATIO           = Log2<PACKING_RATIO>::VALUE,
-
-        LOG_COUNTER_LANES           = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0),                // Always at least one lane
-        COUNTER_LANES               = 1 << LOG_COUNTER_LANES,
-
-        // The number of packed counters per thread (plus one for padding)
-        PADDED_COUNTER_LANES        = COUNTER_LANES + 1,
-        RAKING_SEGMENT              = PADDED_COUNTER_LANES,
-    };
-
-public:
-
-    enum
-    {
-        /// Number of bin-starting offsets tracked per thread
-        BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS),
-    };
-
-private:
-
-
-    /// BlockScan type
-    typedef BlockScan<
-            PackedCounter,
-            BLOCK_DIM_X,
-            INNER_SCAN_ALGORITHM,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        BlockScan;
-
-
-    /// Shared memory storage layout type for BlockRadixRank
-    struct __align__(16) _TempStorage
-    {
-        union Aliasable
-        {
-            DigitCounter            digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
-            PackedCounter           raking_grid[BLOCK_THREADS][RAKING_SEGMENT];
-
-        } aliasable;
-
-        // Storage for scanning local ranks
-        typename BlockScan::TempStorage block_scan;
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-    /// Copy of raking segment, promoted to registers
-    PackedCounter cached_segment[RAKING_SEGMENT];
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /**
-     * Internal storage allocator
-     */
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /**
-     * Performs upsweep raking reduction, returning the aggregate
-     */
-    __device__ __forceinline__ PackedCounter Upsweep()
-    {
-        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
-        PackedCounter *raking_ptr;
-
-        if (MEMOIZE_OUTER_SCAN)
-        {
-            // Copy data into registers
-            #pragma unroll
-            for (int i = 0; i < RAKING_SEGMENT; i++)
-            {
-                cached_segment[i] = smem_raking_ptr[i];
-            }
-            raking_ptr = cached_segment;
-        }
-        else
-        {
-            raking_ptr = smem_raking_ptr;
-        }
-
-        return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
-    }
-
-
-    /// Performs exclusive downsweep raking scan
-    __device__ __forceinline__ void ExclusiveDownsweep(
-        PackedCounter raking_partial)
-    {
-        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
-
-        PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
-            cached_segment :
-            smem_raking_ptr;
-
-        // Exclusive raking downsweep scan
-        internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
-
-        if (MEMOIZE_OUTER_SCAN)
-        {
-            // Copy data back to smem
-            #pragma unroll
-            for (int i = 0; i < RAKING_SEGMENT; i++)
-            {
-                smem_raking_ptr[i] = cached_segment[i];
-            }
-        }
-    }
-
-
-    /**
-     * Reset shared memory digit counters
-     */
-    __device__ __forceinline__ void ResetCounters()
-    {
-        // Reset shared memory digit counters
-        #pragma unroll
-        for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
-        {
-            *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0;
-        }
-    }
-
-
-    /**
-     * Block-scan prefix callback
-     */
-    struct PrefixCallBack
-    {
-        __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate)
-        {
-            PackedCounter block_prefix = 0;
-
-            // Propagate totals in packed fields
-            #pragma unroll
-            for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
-            {
-                block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
-            }
-
-            return block_prefix;
-        }
-    };
-
-
-    /**
-     * Scan shared memory digit counters.
-     */
-    __device__ __forceinline__ void ScanCounters()
-    {
-        // Upsweep scan
-        PackedCounter raking_partial = Upsweep();
-
-        // Compute exclusive sum
-        PackedCounter exclusive_partial;
-        PrefixCallBack prefix_call_back;
-        BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);
-
-        // Downsweep scan with exclusive partial
-        ExclusiveDownsweep(exclusive_partial);
-    }
-
-public:
-
-    /// \smemstorage{BlockScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixRank()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixRank(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Raking
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Rank keys.
-     */
-    template <
-        typename        UnsignedBits,
-        int             KEYS_PER_THREAD>
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
-        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
-        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
-        int             num_bits)                           ///< [in] The number of bits in the current digit
-    {
-        DigitCounter    thread_prefixes[KEYS_PER_THREAD];   // For each key, the count of previous keys in this tile having the same digit
-        DigitCounter*   digit_counters[KEYS_PER_THREAD];    // For each key, the byte-offset of its corresponding digit counter in smem
-
-        // Reset shared memory digit counters
-        ResetCounters();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
-        {
-            // Get digit
-            unsigned int digit = BFE(keys[ITEM], current_bit, num_bits);
-
-            // Get sub-counter
-            unsigned int sub_counter = digit >> LOG_COUNTER_LANES;
-
-            // Get counter lane
-            unsigned int counter_lane = digit & (COUNTER_LANES - 1);
-
-            if (IS_DESCENDING)
-            {
-                sub_counter = PACKING_RATIO - 1 - sub_counter;
-                counter_lane = COUNTER_LANES - 1 - counter_lane;
-            }
-
-            // Pointer to smem digit counter
-            digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter];
-
-            // Load thread-exclusive prefix
-            thread_prefixes[ITEM] = *digit_counters[ITEM];
-
-            // Store inclusive prefix
-            *digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
-        }
-
-        CTA_SYNC();
-
-        // Scan shared memory counters
-        ScanCounters();
-
-        CTA_SYNC();
-
-        // Extract the local ranks of each key
-        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
-        {
-            // Add in thread block exclusive prefix
-            ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
-        }
-    }
-
-
-    /**
-     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
-     */
-    template <
-        typename        UnsignedBits,
-        int             KEYS_PER_THREAD>
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
-        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
-        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
-        int             num_bits,                           ///< [in] The number of bits in the current digit
-        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
-    {
-        // Rank keys
-        RankKeys(keys, ranks, current_bit, num_bits);
-
-        // Get the inclusive and exclusive digit totals corresponding to the calling thread.
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                if (IS_DESCENDING)
-                    bin_idx = RADIX_DIGITS - bin_idx - 1;
-
-                // Obtain ex/inclusive digit counts.  (Unfortunately these all reside in the
-                // first counter column, resulting in unavoidable bank conflicts.)
-                unsigned int counter_lane   = (bin_idx & (COUNTER_LANES - 1));
-                unsigned int sub_counter    = bin_idx >> (LOG_COUNTER_LANES);
-
-                exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter];
-            }
-        }
-    }
-};
-
-
-
-
-
-/**
- * Radix-rank using match.any
- */
-template <
-    int                     BLOCK_DIM_X,
-    int                     RADIX_BITS,
-    bool                    IS_DESCENDING,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
-    int                     BLOCK_DIM_Y             = 1,
-    int                     BLOCK_DIM_Z             = 1,
-    int                     PTX_ARCH                = CUB_PTX_ARCH>
-class BlockRadixRankMatch
-{
-private:
-
-    /******************************************************************************
-     * Type definitions and constants
-     ******************************************************************************/
-
-    typedef int32_t    RankT;
-    typedef int32_t    DigitCounterT;
-
-    enum
-    {
-        // The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        RADIX_DIGITS                = 1 << RADIX_BITS,
-
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
-        WARP_THREADS                = 1 << LOG_WARP_THREADS,
-        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        PADDED_WARPS            = ((WARPS & 0x1) == 0) ?
-                                    WARPS + 1 :
-                                    WARPS,
-
-        COUNTERS                = PADDED_WARPS * RADIX_DIGITS,
-        RAKING_SEGMENT          = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
-        PADDED_RAKING_SEGMENT   = ((RAKING_SEGMENT & 0x1) == 0) ?
-                                    RAKING_SEGMENT + 1 :
-                                    RAKING_SEGMENT,
-    };
-
-public:
-
-    enum
-    {
-        /// Number of bin-starting offsets tracked per thread
-        BINS_TRACKED_PER_THREAD = CUB_MAX(1, RADIX_DIGITS / BLOCK_THREADS),
-    };
-
-private:
-
-    /// BlockScan type
-    typedef BlockScan<
-            DigitCounterT,
-            BLOCK_THREADS,
-            INNER_SCAN_ALGORITHM,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        BlockScanT;
-
-
-    /// Shared memory storage layout type for BlockRadixRank
-    struct __align__(16) _TempStorage
-    {
-        typename BlockScanT::TempStorage            block_scan;
-
-        union __align__(16) Aliasable
-        {
-            volatile DigitCounterT                  warp_digit_counters[RADIX_DIGITS][PADDED_WARPS];
-            DigitCounterT                           raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT];
-
-        } aliasable;
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-
-public:
-
-    /// \smemstorage{BlockScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixRankMatch(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Raking
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Rank keys.
-     */
-    template <
-        typename        UnsignedBits,
-        int             KEYS_PER_THREAD>
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
-        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
-        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
-        int             num_bits)                           ///< [in] The number of bits in the current digit
-    {
-        // Initialize shared digit counters
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
-            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0;
-
-        CTA_SYNC();
-
-        // Each warp will strip-mine its section of input, one strip at a time
-
-        volatile DigitCounterT  *digit_counters[KEYS_PER_THREAD];
-        uint32_t                lane_id         = LaneId();
-        uint32_t                warp_id         = linear_tid >> LOG_WARP_THREADS;
-        uint32_t                lane_mask_lt    = LaneMaskLt();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
-        {
-            // My digit
-            uint32_t digit = BFE(keys[ITEM], current_bit, num_bits);
-
-            if (IS_DESCENDING)
-                digit = RADIX_DIGITS - digit - 1;
-
-            // Mask of peers who have same digit as me
-            uint32_t peer_mask = MatchAny<RADIX_BITS>(digit);
-
-            // Pointer to smem digit counter for this key
-            digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id];
-
-            // Number of occurrences in previous strips
-            DigitCounterT warp_digit_prefix = *digit_counters[ITEM];
-
-            // Warp-sync
-            WARP_SYNC(0xFFFFFFFF);
-
-            // Number of peers having same digit as me
-            int32_t digit_count = __popc(peer_mask);
-
-            // Number of lower-ranked peers having same digit seen so far
-            int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
-
-            if (peer_digit_prefix == 0)
-            {
-                // First thread for each digit updates the shared warp counter
-                *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
-            }
-
-            // Warp-sync
-            WARP_SYNC(0xFFFFFFFF);
-
-            // Number of prior keys having same digit
-            ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
-        }
-
-        CTA_SYNC();
-
-        // Scan warp counters
-
-        DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
-            scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM];
-
-        BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
-            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM];
-
-        CTA_SYNC();
-
-        // Seed ranks with counter values from previous warps
-        #pragma unroll
-        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
-            ranks[ITEM] += *digit_counters[ITEM];
-    }
-
-
-    /**
-     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
-     */
-    template <
-        typename        UnsignedBits,
-        int             KEYS_PER_THREAD>
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
-        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
-        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
-        int             num_bits,                           ///< [in] The number of bits in the current digit
-        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
-    {
-        RankKeys(keys, ranks, current_bit, num_bits);
-
-        // Get exclusive count for each digit
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                if (IS_DESCENDING)
-                    bin_idx = RADIX_DIGITS - bin_idx - 1;
-
-                exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0];
-            }
-        }
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/block/block_radix_sort.cuh b/external/cub/cub/block/block_radix_sort.cuh
deleted file mode 100644
index 736fbde746a..00000000000
--- a/external/cub/cub/block/block_radix_sort.cuh
+++ /dev/null
@@ -1,862 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block.
- */
-
-
-#pragma once
-
-#include "block_exchange.cuh"
-#include "block_radix_rank.cuh"
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method.  ![](sorting_logo.png)
- * \ingroup BlockModule
- *
- * \tparam KeyT                 KeyT type
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of items per thread
- * \tparam ValueT               <b>[optional]</b> ValueT type (default: cub::NullType, which indicates a keys-only sort)
- * \tparam RADIX_BITS           <b>[optional]</b> The number of radix bits per digit place (default: 4 bits)
- * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).
- * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
- * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
- *   items into ascending order.  It relies upon a positional representation for
- *   keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
- *   characters, etc.) specified from least-significant to most-significant.  For a
- *   given input sequence of keys and a set of rules specifying a total ordering
- *   of the symbolic alphabet, the radix sorting method produces a lexicographic
- *   ordering of those keys.
- * - BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
- *   <tt>unsigned char</tt>, \p int, \p double, etc.  Within each key, the implementation treats fixed-length
- *   bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
- *   method can only be applied to unsigned integral types, BlockRadixSort
- *   is able to sort signed and floating-point types via simple bit-wise transformations
- *   that ensure lexicographic key ordering.
- * - \rowmajor
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par A Simple Example
- * \blockcollective{BlockRadixSort}
- * \par
- * The code snippet below illustrates a sort of 512 integer keys that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
- *
- *     // Allocate shared memory for BlockRadixSort
- *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_keys[4];
- *     ...
- *
- *     // Collectively sort the keys
- *     BlockRadixSort(temp_storage).Sort(thread_keys);
- *
- *     ...
- * \endcode
- * \par
- * Suppose the set of input \p thread_keys across the block of threads is
- * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
- * corresponding output \p thread_keys in those threads will be
- * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
- *
- */
-template <
-    typename                KeyT,
-    int                     BLOCK_DIM_X,
-    int                     ITEMS_PER_THREAD,
-    typename                ValueT                   = NullType,
-    int                     RADIX_BITS              = 4,
-    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
-    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
-    int                     BLOCK_DIM_Y             = 1,
-    int                     BLOCK_DIM_Z             = 1,
-    int                     PTX_ARCH                = CUB_PTX_ARCH>
-class BlockRadixSort
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        // The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        // Whether or not there are values to be trucked along with keys
-        KEYS_ONLY                   = Equals<ValueT, NullType>::VALUE,
-    };
-
-    // KeyT traits and unsigned bits type
-    typedef Traits<KeyT>                        KeyTraits;
-    typedef typename KeyTraits::UnsignedBits    UnsignedBits;
-
-    /// Ascending BlockRadixRank utility type
-    typedef BlockRadixRank<
-            BLOCK_DIM_X,
-            RADIX_BITS,
-            false,
-            MEMOIZE_OUTER_SCAN,
-            INNER_SCAN_ALGORITHM,
-            SMEM_CONFIG,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        AscendingBlockRadixRank;
-
-    /// Descending BlockRadixRank utility type
-    typedef BlockRadixRank<
-            BLOCK_DIM_X,
-            RADIX_BITS,
-            true,
-            MEMOIZE_OUTER_SCAN,
-            INNER_SCAN_ALGORITHM,
-            SMEM_CONFIG,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        DescendingBlockRadixRank;
-
-    /// BlockExchange utility type for keys
-    typedef BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeKeys;
-
-    /// BlockExchange utility type for values
-    typedef BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeValues;
-
-    /// Shared memory storage layout type
-    union _TempStorage
-    {
-        typename AscendingBlockRadixRank::TempStorage  asending_ranking_storage;
-        typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
-        typename BlockExchangeKeys::TempStorage        exchange_keys;
-        typename BlockExchangeValues::TempStorage      exchange_values;
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-    /// Rank keys (specialized for ascending sort)
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        int             begin_bit,
-        int             pass_bits,
-        Int2Type<false> /*is_descending*/)
-    {
-        AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(
-            unsigned_keys,
-            ranks,
-            begin_bit,
-            pass_bits);
-    }
-
-    /// Rank keys (specialized for descending sort)
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        int             begin_bit,
-        int             pass_bits,
-        Int2Type<true>  /*is_descending*/)
-    {
-        DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(
-            unsigned_keys,
-            ranks,
-            begin_bit,
-            pass_bits);
-    }
-
-    /// ExchangeValues (specialized for key-value sort, to-blocked arrangement)
-    __device__ __forceinline__ void ExchangeValues(
-        ValueT          (&values)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        Int2Type<false> /*is_keys_only*/,
-        Int2Type<true>  /*is_blocked*/)
-    {
-        CTA_SYNC();
-
-        // Exchange values through shared memory in blocked arrangement
-        BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
-    }
-
-    /// ExchangeValues (specialized for key-value sort, to-striped arrangement)
-    __device__ __forceinline__ void ExchangeValues(
-        ValueT          (&values)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        Int2Type<false> /*is_keys_only*/,
-        Int2Type<false> /*is_blocked*/)
-    {
-        CTA_SYNC();
-
-        // Exchange values through shared memory in blocked arrangement
-        BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
-    }
-
-    /// ExchangeValues (specialized for keys-only sort)
-    template <int IS_BLOCKED>
-    __device__ __forceinline__ void ExchangeValues(
-        ValueT                  (&/*values*/)[ITEMS_PER_THREAD],
-        int                     (&/*ranks*/)[ITEMS_PER_THREAD],
-        Int2Type<true>          /*is_keys_only*/,
-        Int2Type<IS_BLOCKED>    /*is_blocked*/)
-    {}
-
-    /// Sort blocked arrangement
-    template <int DESCENDING, int KEYS_ONLY>
-    __device__ __forceinline__ void SortBlocked(
-        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
-        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
-        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
-        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
-        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
-    {
-        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
-            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
-
-        // Twiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
-        }
-
-        // Radix sorting passes
-        while (true)
-        {
-            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
-
-            // Rank the blocked keys
-            int ranks[ITEMS_PER_THREAD];
-            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
-            begin_bit += RADIX_BITS;
-
-            CTA_SYNC();
-
-            // Exchange keys through shared memory in blocked arrangement
-            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
-
-            // Exchange values through shared memory in blocked arrangement
-            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
-
-            // Quit if done
-            if (begin_bit >= end_bit) break;
-
-            CTA_SYNC();
-        }
-
-        // Untwiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
-        }
-    }
-
-public:
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    /// Sort blocked -> striped arrangement
-    template <int DESCENDING, int KEYS_ONLY>
-    __device__ __forceinline__ void SortBlockedToStriped(
-        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
-        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
-        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
-        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
-        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
-    {
-        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
-            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
-
-        // Twiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
-        }
-
-        // Radix sorting passes
-        while (true)
-        {
-            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
-
-            // Rank the blocked keys
-            int ranks[ITEMS_PER_THREAD];
-            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
-            begin_bit += RADIX_BITS;
-
-            CTA_SYNC();
-
-            // Check if this is the last pass
-            if (begin_bit >= end_bit)
-            {
-                // Last pass exchanges keys through shared memory in striped arrangement
-                BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
-
-                // Last pass exchanges through shared memory in striped arrangement
-                ExchangeValues(values, ranks, is_keys_only, Int2Type<false>());
-
-                // Quit
-                break;
-            }
-
-            // Exchange keys through shared memory in blocked arrangement
-            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
-
-            // Exchange values through shared memory in blocked arrangement
-            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
-
-            CTA_SYNC();
-        }
-
-        // Untwiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
-        }
-    }
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-    /// \smemstorage{BlockRadixSort}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixSort()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixSort(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Sorting (blocked arrangements)
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Performs an ascending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).Sort(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
-     * The corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     */
-    __device__ __forceinline__ void Sort(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs an ascending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void Sort(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-    /**
-     * \brief Performs a descending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).Sort(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
-     * The corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
-     */
-    __device__ __forceinline__ void SortDescending(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs a descending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortDescending(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Sorting (blocked arrangement -> striped arrangement)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortBlockedToStriped(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortBlockedToStriped(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortDescendingBlockedToStriped(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortDescendingBlockedToStriped(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    //@}  end member group
-
-};
-
-/**
- * \example example_block_radix_sort.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/block_raking_layout.cuh b/external/cub/cub/block/block_raking_layout.cuh
deleted file mode 100644
index ab6b71036cd..00000000000
--- a/external/cub/cub/block/block_raking_layout.cuh
+++ /dev/null
@@ -1,152 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
- */
-
-
-#pragma once
-
-#include "../util_macro.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.    ![](raking.png)
- * \ingroup BlockModule
- *
- * \par Overview
- * This type facilitates a shared memory usage pattern where a block of CUDA
- * threads places elements into shared memory and then reduces the active
- * parallelism to one "raking" warp of threads for serially aggregating consecutive
- * sequences of shared items.  Padding is inserted to eliminate bank conflicts
- * (for most data types).
- *
- * \tparam T                        The data type to be exchanged.
- * \tparam BLOCK_THREADS            The thread block size in threads.
- * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
- */
-template <
-    typename    T,
-    int         BLOCK_THREADS,
-    int         PTX_ARCH = CUB_PTX_ARCH>
-struct BlockRakingLayout
-{
-    //---------------------------------------------------------------------
-    // Constants and type definitions
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        /// The total number of elements that need to be cooperatively reduced
-        SHARED_ELEMENTS = BLOCK_THREADS,
-
-        /// Maximum number of warp-synchronous raking threads
-        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Number of raking elements per warp-synchronous raking thread (rounded up)
-        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
-
-        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
-        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
-
-        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
-        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
-
-        /// Degree of bank conflicts (e.g., 4-way)
-        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
-            (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
-            1,
-
-        /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load
-        USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),
-
-        /// Total number of elements in the raking grid
-        GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),
-
-        /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
-        UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
-    };
-
-
-    /**
-     * \brief Shared memory storage type
-     */
-    struct __align__(16) _TempStorage
-    {
-        T buff[BlockRakingLayout::GRID_ELEMENTS];
-    };
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /**
-     * \brief Returns the location for the calling thread to place data into the grid
-     */
-    static __device__ __forceinline__ T* PlacementPtr(
-        TempStorage &temp_storage,
-        unsigned int linear_tid)
-    {
-        // Offset for partial
-        unsigned int offset = linear_tid;
-
-        // Add in one padding element for every segment
-        if (USE_SEGMENT_PADDING > 0)
-        {
-            offset += offset / SEGMENT_LENGTH;
-        }
-
-        // Incorporating a block of padding partials every shared memory segment
-        return temp_storage.Alias().buff + offset;
-    }
-
-
-    /**
-     * \brief Returns the location for the calling thread to begin sequential raking
-     */
-    static __device__ __forceinline__ T* RakingPtr(
-        TempStorage &temp_storage,
-        unsigned int linear_tid)
-    {
-        return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
-    }
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/block_reduce.cuh b/external/cub/cub/block/block_reduce.cuh
deleted file mode 100644
index a9de9e71742..00000000000
--- a/external/cub/cub/block/block_reduce.cuh
+++ /dev/null
@@ -1,607 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "specializations/block_reduce_raking.cuh"
-#include "specializations/block_reduce_raking_commutative_only.cuh"
-#include "specializations/block_reduce_warp_reductions.cuh"
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * BlockReduceAlgorithm enumerates alternative algorithms for parallel
- * reduction across a CUDA thread block.
- */
-enum BlockReduceAlgorithm
-{
-
-    /**
-     * \par Overview
-     * An efficient "raking" reduction algorithm that only supports commutative
-     * reduction operators (true for most operations, e.g., addition).
-     *
-     * \par
-     * Execution is comprised of three phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Threads in warps other than the first warp place
-     *    their partial reductions into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within the first
-     *    warp continue to accumulate by raking across segments of shared partial reductions
-     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
-     *
-     * \par
-     * \image html block_reduce.png
-     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
-     *   and is preferable when the reduction operator is commutative.  This variant
-     *   applies fewer reduction operators  than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
-     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
-     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
-     *   when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
-
-
-    /**
-     * \par Overview
-     * An efficient "raking" reduction algorithm that supports commutative
-     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
-     * operators. \blocked.
-     *
-     * \par
-     * Execution is comprised of three phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Each thread then places the partial reduction
-     *    of its item(s) into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within a
-     *    single warp rake across segments of shared partial reductions.
-     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
-     *
-     * \par
-     * \image html block_reduce.png
-     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - This variant performs more communication than BLOCK_REDUCE_RAKING
-     *   and is only preferable when the reduction operator is non-commutative.  This variant
-     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
-     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
-     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
-     *   when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_RAKING,
-
-
-    /**
-     * \par Overview
-     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
-     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
-     * operators.
-     *
-     * \par
-     * Execution is comprised of four phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Each thread then places the partial reduction
-     *    of its item(s) into shared memory.
-     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
-     *    reduction within each warp.
-     * -# A propagation phase where the warp reduction outputs in each warp are
-     *    updated with the aggregate from each preceding warp.
-     *
-     * \par
-     * \image html block_scan_warpscans.png
-     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
-     *   or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall
-     *   throughput across the GPU.  However turn-around latency may be lower and
-     *   thus useful when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_WARP_REDUCTIONS,
-};
-
-
-/******************************************************************************
- * Block reduce
- ******************************************************************************/
-
-/**
- * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                Data type being reduced
- * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
- * \tparam ALGORITHM        <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
- * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- *   uses a binary combining operator to compute a single aggregate from a list of input elements.
- * - \rowmajor
- * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
- *   -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>.  An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
- *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
- *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- * - Very efficient (only one synchronization barrier).
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *   - Summation (<b><em>vs.</em></b> generic reduction)
- *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
- *   - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
- * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
- *
- * \par A Simple Example
- * \blockcollective{BlockReduce}
- * \par
- * The code snippet below illustrates a sum reduction of 512 integer items that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockReduce for a 1D block of 128 threads on type int
- *     typedef cub::BlockReduce<int, 128> BlockReduce;
- *
- *     // Allocate shared memory for BlockReduce
- *     __shared__ typename BlockReduce::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Compute the block-wide sum for thread0
- *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
- *
- * \endcode
- *
- */
-template <
-    typename                T,
-    int                     BLOCK_DIM_X,
-    BlockReduceAlgorithm    ALGORITHM       = BLOCK_REDUCE_WARP_REDUCTIONS,
-    int                     BLOCK_DIM_Y     = 1,
-    int                     BLOCK_DIM_Z     = 1,
-    int                     PTX_ARCH        = CUB_PTX_ARCH>
-class BlockReduce
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>           WarpReductions;
-    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>    RakingCommutativeOnly;
-    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                   Raking;
-
-    /// Internal specialization type
-    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
-        WarpReductions,
-        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
-            RakingCommutativeOnly,
-            Raking>::Type>::Type InternalBlockReduce;     // BlockReduceRaking
-
-    /// Shared memory storage layout type for BlockReduce
-    typedef typename InternalBlockReduce::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-public:
-
-    /// \smemstorage{BlockReduce}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockReduce()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockReduce(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Generic reductions
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
-     *
-     * \endcode
-     *
-     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               input,                      ///< [in] Calling thread's input
-        ReductionOp     reduction_op)               ///< [in] Binary reduction functor 
-    {
-        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
-     *
-     * \endcode
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int ITEMS_PER_THREAD,
-        typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
-        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor 
-    {
-        // Reduce partials
-        T partial = internal::ThreadReduce(inputs, reduction_op);
-        return Reduce(partial, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(int num_valid, ...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     if (threadIdx.x < num_valid) thread_data = ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
-     *
-     * \endcode
-     *
-     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,                  ///< [in] Calling thread's input
-        ReductionOp         reduction_op,           ///< [in] Binary reduction functor 
-        int                 num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
-    {
-        // Determine if we scan skip bounds checking
-        if (num_valid >= BLOCK_THREADS)
-        {
-            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
-        }
-        else
-        {
-            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
-        }
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Summation reductions
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
-     *
-     * \endcode
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T   input)                      ///< [in] Calling thread's input
-    {
-        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
-    }
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
-     *
-     * \endcode
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ T Sum(
-        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
-    {
-        // Reduce partials
-        T partial = internal::ThreadReduce(inputs, cub::Sum());
-        return Sum(partial);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(int num_valid, ...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item (up to num_items)
-     *     int thread_data;
-     *     if (threadIdx.x < num_valid)
-     *         thread_data = ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
-     *
-     * \endcode
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T   input,                  ///< [in] Calling thread's input
-        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
-    {
-        // Determine if we scan skip bounds checking
-        if (num_valid >= BLOCK_THREADS)
-        {
-            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
-        }
-        else
-        {
-            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
-        }
-    }
-
-
-    //@}  end member group
-};
-
-/**
- * \example example_block_reduce.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/block_scan.cuh b/external/cub/cub/block/block_scan.cuh
deleted file mode 100644
index 245084cff61..00000000000
--- a/external/cub/cub/block/block_scan.cuh
+++ /dev/null
@@ -1,2126 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "specializations/block_scan_raking.cuh"
-#include "specializations/block_scan_warp_scans.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_ptx.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
- */
-enum BlockScanAlgorithm
-{
-
-    /**
-     * \par Overview
-     * An efficient "raking reduce-then-scan" prefix scan algorithm.  Execution is comprised of five phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within a single warp rake across segments of shared partial reductions.
-     * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
-     * -# Downsweep sequential exclusive scan in shared memory.  Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
-     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
-     *
-     * \par
-     * \image html block_scan_raking.png
-     * <div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - Although this variant may suffer longer turnaround latencies when the
-     *   GPU is under-occupied, it can often provide higher overall throughput
-     *   across the GPU when suitably occupied.
-     */
-    BLOCK_SCAN_RAKING,
-
-
-    /**
-     * \par Overview
-     * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
-     * the expense of higher register pressure.  Raking threads preserve their
-     * "upsweep" segment of values in registers while performing warp-synchronous
-     * scan, allowing the "downsweep" not to re-read them from shared memory.
-     */
-    BLOCK_SCAN_RAKING_MEMOIZE,
-
-
-    /**
-     * \par Overview
-     * A quick "tiled warpscans" prefix scan algorithm.  Execution is comprised of four phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
-     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
-     * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
-     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
-     *
-     * \par
-     * \image html block_scan_warpscans.png
-     * <div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - Although this variant may suffer lower overall throughput across the
-     *   GPU because due to a heavy reliance on inefficient warpscans, it can
-     *   often provide lower turnaround latencies when the GPU is under-occupied.
-     */
-    BLOCK_SCAN_WARP_SCANS,
-};
-
-
-/******************************************************************************
- * Block scan
- ******************************************************************************/
-
-/**
- * \brief The BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                Data type being scanned
- * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
- * \tparam ALGORITHM        <b>[optional]</b> cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING)
- * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
- *   produces an output list where each element is computed to be the reduction
- *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
- *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
- *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
- *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
- *   the <em>i</em><sup>th</sup> output reduction.
- * - \rowmajor
- * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
- *   -# <b>cub::BLOCK_SCAN_RAKING</b>.  An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
- *   -# <b>cub::BLOCK_SCAN_RAKING_MEMOIZE</b>.  Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
- *   -# <b>cub::BLOCK_SCAN_WARP_SCANS</b>.  A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- * - Uses special instructions when applicable (e.g., warp \p SHFL)
- * - Uses synchronization-free communication between warp lanes when applicable
- * - Invokes a minimal number of minimal block-wide synchronization barriers (only
- *   one or two depending on algorithm selection)
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *   - Prefix sum variants (<b><em>vs.</em></b> generic scan)
- *   - \blocksize
- * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
- *
- * \par A Simple Example
- * \blockcollective{BlockScan}
- * \par
- * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockScan for a 1D block of 128 threads on type int
- *     typedef cub::BlockScan<int, 128> BlockScan;
- *
- *     // Allocate shared memory for BlockScan
- *     __shared__ typename BlockScan::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Collectively compute the block-wide exclusive prefix sum
- *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is
- * <tt>{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}</tt>.
- * The corresponding output \p thread_data in those threads will be
- * <tt>{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}</tt>.
- *
- */
-template <
-    typename            T,
-    int                 BLOCK_DIM_X,
-    BlockScanAlgorithm  ALGORITHM       = BLOCK_SCAN_RAKING,
-    int                 BLOCK_DIM_Y     = 1,
-    int                 BLOCK_DIM_Z     = 1,
-    int                 PTX_ARCH        = CUB_PTX_ARCH>
-class BlockScan
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /**
-     * Ensure the template parameterization meets the requirements of the
-     * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
-     * cannot be used with thread block sizes not a multiple of the
-     * architectural warp size.
-     */
-    static const BlockScanAlgorithm SAFE_ALGORITHM =
-        ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ?
-            BLOCK_SCAN_RAKING :
-            ALGORITHM;
-
-    typedef BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpScans;
-    typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;
-
-    /// Define the delegate type for the desired algorithm
-    typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
-        WarpScans,
-        Raking>::Type InternalBlockScan;
-
-    /// Shared memory storage layout type for BlockScan
-    typedef typename InternalBlockScan::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Public types
-     ******************************************************************************/
-public:
-
-    /// \smemstorage{BlockScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockScan()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockScan(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix sum operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  The value of 0 is applied as the initial value, and is assigned to \p output in <em>thread</em><sub>0</sub>.
-     *
-     * \par
-     * - \identityzero
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
-     *
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
-    {
-        T initial_value = 0;
-        ExclusiveScan(input, output, initial_value, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  The value of 0 is applied as the initial value, and is assigned to \p output in <em>thread</em><sub>0</sub>.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
-     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
-     *
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        T initial_value = 0;
-        ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the block-wide exclusive prefix sum
-     *         BlockScan(temp_storage).ExclusiveSum(
-     *             thread_data, thread_data, prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 1, ..., 127</tt>.
-     * The output for the second segment will be <tt>128, 129, ..., 255</tt>.
-     *
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix sum operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  The value of 0 is applied as the initial value, and is assigned to \p output[0] in <em>thread</em><sub>0</sub>.
-     *
-     * \par
-     * - \identityzero
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD])  ///< [out] Calling thread's output items (may be aliased to \p input)
-    {
-        T initial_value = 0;
-        ExclusiveScan(input, output, initial_value, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  The value of 0 is applied as the initial value, and is assigned to \p output[0] in <em>thread</em><sub>0</sub>.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                 (&input)[ITEMS_PER_THREAD],       ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],      ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                 &block_aggregate)                 ///< [out] block-wide aggregate reduction of input items
-    {
-        // Reduce consecutive thread items in registers
-        T initial_value = 0;
-        ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
-     * across 128 threads where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *
-     *         // Collectively compute the block-wide exclusive prefix sum
-     *         int block_aggregate;
-     *         BlockScan(temp_storage.scan).ExclusiveSum(
-     *             thread_data, thread_data, prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 1, 2, 3, ..., 510, 511</tt>.
-     * The output for the second segment will be <tt>512, 513, 514, 515, ..., 1022, 1023</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int ITEMS_PER_THREAD,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                       (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)    ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);  // Prefix sum is the callback-seeded exclusive scan with cub::Sum as the operator
-    }
-
-
-
-    //@}  end member group        // Exclusive prefix sums
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T               initial_value,                  ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output in <em>thread</em><sub>0</sub>)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op);  // Delegate to the InternalBlockScan implementation constructed over temp_storage
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
-     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item
-        T               &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
-        T               initial_value,      ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output in <em>thread</em><sub>0</sub>)
-        ScanOp          scan_op,            ///< [in] Binary scan functor
-        T               &block_aggregate)   ///< [out] block-wide aggregate reduction of input items
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);  // Delegate to the InternalBlockScan implementation constructed over temp_storage
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the block-wide aggregate reduction of the scan inputs.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items, one item per thread
-     *         int thread_data = d_data[block_offset + threadIdx.x];
-     *
-     *         // Collectively compute the block-wide exclusive prefix max scan
-     *         BlockScan(temp_storage).ExclusiveScan(
-     *             thread_data, thread_data, cub::Max(), prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset + threadIdx.x] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
-     * The output for the second segment will be <tt>126, 128, 128, 130, ..., 252, 254</tt>.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op);  // No initial_value here: the seed is supplied through block_prefix_callback_op
-    }
-
-
-    //@}  end member group        // Exclusive prefix scan operations
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                 initial_value,                ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
-        ScanOp            scan_op)                      ///< [in] Binary scan functor
-    {
-        // Step 1: fold this thread's ITEMS_PER_THREAD inputs into a single partial using scan_op
-        T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-        // Step 2: exclusive block-wide scan over the per-thread partials (in place), seeded with initial_value
-        ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
-
-        // Step 3: exclusive scan over this thread's items, using the scanned thread_prefix as the seed
-        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
-     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                 initial_value,                ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
-        ScanOp            scan_op,                      ///< [in] Binary scan functor
-        T                 &block_aggregate)             ///< [out] block-wide aggregate reduction of input items
-    {
-        // Step 1: fold this thread's ITEMS_PER_THREAD inputs into a single partial using scan_op
-        T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-        // Step 2: exclusive block-wide scan over the per-thread partials (in place), seeded with initial_value; also yields block_aggregate
-        ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
-
-        // Step 3: exclusive scan over this thread's items, using the scanned thread_prefix as the seed
-        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the block-wide aggregate reduction of the scan inputs.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 512 integer items that are partitioned across 128 threads (4 consecutive items per thread).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total (INT_MIN is the identity for a max scan)
-     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *
-     *         // Collectively compute the block-wide exclusive prefix max scan
-     *         BlockScan(temp_storage.scan).ExclusiveScan(
-     *             thread_data, thread_data, cub::Max(), prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510</tt>.
-     * The output for the second segment will be <tt>510, 512, 512, 514, 514, 516, ..., 1020, 1022</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD         <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp                   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp    <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp,
-        typename        BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        // Step 1: fold this thread's ITEMS_PER_THREAD inputs into a single partial using scan_op
-        T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-        // Step 2: exclusive block-wide scan over the per-thread partials (in place), seeded through block_prefix_callback_op
-        ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
-
-        // Step 3: exclusive scan over this thread's items, using the scanned thread_prefix as the seed
-        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
-    }
-
-
-    //@}  end member group
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document no-initial-value scans
-
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations (no initial value, single datum per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input); undefined in <em>thread</em><sub>0</sub> because no initial value is supplied
-        ScanOp          scan_op)                        ///< [in] Binary scan functor
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op);  // Delegate to the InternalBlockScan implementation constructed over temp_storage
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input); undefined in <em>thread</em><sub>0</sub> because no initial value is supplied
-        ScanOp          scan_op,                        ///< [in] Binary scan functor
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);  // Delegate to the InternalBlockScan implementation constructed over temp_storage
-    }
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations (no initial value, multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp            scan_op)                      ///< [in] Binary scan functor
-    {
-        // Step 1: fold this thread's ITEMS_PER_THREAD inputs into a single partial using scan_op
-        T thread_partial = internal::ThreadReduce(input, scan_op);
-
-        // Step 2: exclusive block-wide scan over the per-thread partials (in place), with no initial value
-        ExclusiveScan(thread_partial, thread_partial, scan_op);
-
-        // Step 3: exclusive scan over this thread's items; the last argument is false only for linear_tid 0 (presumably so its undefined prefix is not applied -- confirm against ThreadScanExclusive)
-        internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        // Step 1: fold this thread's ITEMS_PER_THREAD inputs into a single partial using scan_op
-        T thread_partial = internal::ThreadReduce(input, scan_op);
-
-        // Step 2: exclusive block-wide scan over the per-thread partials (in place), with no initial value; also yields block_aggregate
-        ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
-
-        // Step 3: exclusive scan over this thread's items; the last argument is false only for linear_tid 0 (presumably so its undefined prefix is not applied -- confirm against ThreadScanExclusive)
-        internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
-    }
-
-
-    //@}  end member group
-#endif // DOXYGEN_SHOULD_SKIP_THIS  // Do not document no-initial-value scans
-
-    /******************************************************************//**
-     * \name Inclusive prefix sum operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.
-     *
-     * \par
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
-     *
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
-    {
-        InclusiveScan(input, output, cub::Sum());  // Inclusive sum is the inclusive scan with cub::Sum as the operator
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
-     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
-     *
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        InclusiveScan(input, output, cub::Sum(), block_aggregate);
-    }
-
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the block-wide inclusive prefix sum
-     *         BlockScan(temp_storage).InclusiveSum(
-     *             thread_data, thread_data, prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>1, 2, ..., 128</tt>.
-     * The output for the second segment will be <tt>129, 130, ..., 256</tt>.
-     *
-     * \tparam BlockPrefixCallbackOp          <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveSum(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix sum operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void InclusiveSum(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD])    ///< [out] Calling thread's output items (may be aliased to \p input)
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveSum(input[0], output[0]);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            Sum scan_op;
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan
-            ExclusiveSum(thread_prefix, thread_prefix);
-
-            // Inclusive scan in registers with prefix as seed
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be
-     * <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
-     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void InclusiveSum(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveSum(input[0], output[0], block_aggregate);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            Sum scan_op;
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan
-            ExclusiveSum(thread_prefix, thread_prefix, block_aggregate);
-
-            // Inclusive scan in registers with prefix as seed
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
-     * across 128 threads where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *
-     *         // Collectively compute the block-wide inclusive prefix sum
-     *         BlockScan(temp_storage.scan).IncluisveSum(
-     *             thread_data, thread_data, prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>1, 2, 3, 4, ..., 511, 512</tt>.
-     * The output for the second segment will be <tt>513, 514, 515, 516, ..., 1023, 1024</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int ITEMS_PER_THREAD,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveSum(
-        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveSum(input[0], output[0], block_prefix_callback_op);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            Sum scan_op;
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan
-            ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
-
-            // Inclusive scan in registers with prefix as seed
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
-        }
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix scan operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor 
-    {
-        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
-     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor 
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the block-wide inclusive prefix max scan
-     *         BlockScan(temp_storage).InclusiveScan(
-     *             thread_data, thread_data, cub::Max(), prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
-     * The output for the second segment will be <tt>128, 128, 130, 130, ..., 254, 254</tt>.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix scan operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor 
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveScan(input[0], output[0], scan_op);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan
-            ExclusiveScan(thread_prefix, thread_prefix, scan_op);
-
-            // Inclusive scan in registers with prefix as seed (first thread does not seed)
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
-     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename         ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor 
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveScan(input[0], output[0], scan_op, block_aggregate);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan (with no initial value)
-            ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate);
-
-            // Inclusive scan in registers with prefix as seed (first thread does not seed)
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *
-     *         // Collectively compute the block-wide inclusive prefix max scan
-     *         BlockScan(temp_storage.scan).InclusiveScan(
-     *             thread_data, thread_data, cub::Max(), prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, 4, 4, ..., 510, 510</tt>.
-     * The output for the second segment will be <tt>512, 512, 514, 514, 516, 516, ..., 1022, 1022</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD         <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp                   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp    <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp,
-        typename        BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan
-            ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
-
-            // Inclusive scan in registers with prefix as seed
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
-        }
-    }
-
-    //@}  end member group
-
-
-};
-
-/**
- * \example example_block_scan.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/block_shuffle.cuh b/external/cub/cub/block/block_shuffle.cuh
deleted file mode 100644
index 504f00e3552..00000000000
--- a/external/cub/cub/block/block_shuffle.cuh
+++ /dev/null
@@ -1,305 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockShuffle class provides [<em>collective</em>](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../util_arch.cuh"
-#include "../util_ptx.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockShuffle class provides [<em>collective</em>](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block.
- * \ingroup BlockModule
- *
- * \tparam T                    The data type to be exchanged.
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * It is commonplace for blocks of threads to rearrange data items between
- * threads.  The BlockShuffle abstraction allows threads to efficiently shift items
- * either (a) up to their successor or (b) down to their predecessor.
- *
- */
-template <
-    typename            T,
-    int                 BLOCK_DIM_X,
-    int                 BLOCK_DIM_Y         = 1,
-    int                 BLOCK_DIM_Z         = 1,
-    int                 PTX_ARCH            = CUB_PTX_ARCH>
-class BlockShuffle
-{
-private:
-
-    /******************************************************************************
-     * Constants
-     ******************************************************************************/
-
-    enum
-    {
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
-        WARP_THREADS                = 1 << LOG_WARP_THREADS,
-        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Shared memory storage layout type (last element from each thread's input)
-    struct _TempStorage
-    {
-        T prev[BLOCK_THREADS];
-        T next[BLOCK_THREADS];
-    };
-
-
-public:
-
-    /// \smemstorage{BlockShuffle}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-private:
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-public:
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockShuffle()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockShuffle(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Shuffle movement
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>. The offset \p distance may be negative.
-     *
-     * \par
-     * - \smemreuse
-     */
-    __device__ __forceinline__ void Offset(
-        T   input,                  ///< [in] The input item from the calling thread (<em>thread<sub>i</sub></em>)
-        T&  output,                 ///< [out] The \p input item from the successor (or predecessor) thread <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub> (may be aliased to \p input).  This value is only updated for for <em>thread<sub>i</sub></em> when 0 <= (<em>i</em> + \p distance) < <tt>BLOCK_THREADS-1</tt>
-        int distance = 1)           ///< [in] Offset distance (may be negative)
-    {
-        temp_storage[linear_tid].prev = input;
-
-        CTA_SYNC();
-
-        if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS))
-            output = temp_storage[linear_tid + distance].prev;
-    }
-
-
-    /**
-     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>.
-     *
-     * \par
-     * - \smemreuse
-     */
-    __device__ __forceinline__ void Rotate(
-        T   input,                  ///< [in] The calling thread's input item
-        T&  output,                 ///< [out] The \p input item from thread <em>thread</em><sub>(<em>i</em>+<tt>distance></tt>)%<tt><BLOCK_THREADS></tt></sub> (may be aliased to \p input).  This value is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>
-        unsigned int distance = 1)  ///< [in] Offset distance (0 < \p distance < <tt>BLOCK_THREADS</tt>)
-    {
-        temp_storage[linear_tid].prev = input;
-
-        CTA_SYNC();
-
-        unsigned int offset = threadIdx.x + distance;
-        if (offset >= BLOCK_THREADS)
-            offset -= BLOCK_THREADS;
-
-        output = temp_storage[offset].prev;
-    }
-
-
-    /**
-     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Up(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
-    {
-        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
-            prev[ITEM] = input[ITEM - 1];
-
-
-        if (linear_tid > 0)
-            prev[0] = temp_storage[linear_tid - 1].prev;
-    }
-
-
-    /**
-     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item.  All threads receive the \p input provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Up(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
-        T &block_suffix)                ///< [out] The item \p input[ITEMS_PER_THREAD-1] from <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>, provided to all threads
-    {
-        Up(input, prev);
-        block_suffix = temp_storage[BLOCK_THREADS - 1].prev;
-    }
-
-
-    /**
-     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it down by one item
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Down(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
-    {
-        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
-            prev[ITEM] = input[ITEM - 1];
-
-        if (linear_tid > 0)
-            prev[0] = temp_storage[linear_tid - 1].prev;
-    }
-
-
-    /**
-     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of input items, shifting it down by one item.  All threads receive \p input[0] provided by <em>thread</em><sub><tt>0</tt></sub>.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Down(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
-        T &block_prefix)                ///< [out] The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
-    {
-        Up(input, prev);
-        block_prefix = temp_storage[BLOCK_THREADS - 1].prev;
-    }
-
-    //@}  end member group
-
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/block_store.cuh b/external/cub/cub/block/block_store.cuh
deleted file mode 100644
index 63039afa8e5..00000000000
--- a/external/cub/cub/block/block_store.cuh
+++ /dev/null
@@ -1,1000 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Operations for writing linear segments of data from the CUDA thread block
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_exchange.cuh"
-#include "../util_ptx.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-
-/******************************************************************//**
- * \name Blocked arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-/**
- * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectBlocked(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-{
-    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
-
-    // Store directly in thread-blocked order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        thread_itr[ITEM] = items[ITEM];
-    }
-}
-
-
-/**
- * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectBlocked(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-    int                 valid_items)                ///< [in] Number of valid items to write
-{
-    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
-
-    // Store directly in thread-blocked order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
-        {
-            thread_itr[ITEM] = items[ITEM];
-        }
-    }
-}
-
-
-/**
- * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
- *
- * \blocked
- *
- * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned,
- * which is the default starting offset returned by \p cudaMalloc()
- *
- * \par
- * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
- *   - \p ITEMS_PER_THREAD is odd
- *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- *
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD>
-__device__ __forceinline__ void StoreDirectBlockedVectorized(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    T                   *block_ptr,                 ///< [in] Input pointer for storing from
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-{
-    enum
-    {
-        // Maximum CUDA vector size is 4 elements
-        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
-
-        // Vector size must be a power of two and an even divisor of the items per thread
-        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
-            MAX_VEC_SIZE :
-            1,
-
-        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
-    };
-
-    // Vector type
-    typedef typename CubVector<T, VEC_SIZE>::Type Vector;
-
-    // Alias global pointer
-    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));
-
-    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
-    Vector raw_vector[VECTORS_PER_THREAD];
-    T *raw_items = reinterpret_cast<T*>(raw_vector);
-
-    // Copy
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        raw_items[ITEM] = items[ITEM];
-    }
-
-    // Direct-store using vector types
-    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
-}
-
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    int                 BLOCK_THREADS,
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-{
-    OutputIteratorT thread_itr = block_itr + linear_tid;
-
-    // Store directly in striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
-    }
-}
-
-
-/**
- * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    int                 BLOCK_THREADS,
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-    int                 valid_items)                ///< [in] Number of valid items to write
-{
-    OutputIteratorT thread_itr = block_itr + linear_tid;
-
-    // Store directly in striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
-        {
-            thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
-        }
-    }
-}
-
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Warp-striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items.
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectWarpStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-
-    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
-
-    // Store directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
-    }
-}
-
-
-/**
- * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectWarpStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-    int                 valid_items)                ///< [in] Number of valid items to write
-{
-    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-
-    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
-
-    // Store directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
-        {
-            thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
-        }
-    }
-}
-
-
-//@}  end member group
-
-
-/** @} */       // end group UtilIo
-
-
-//-----------------------------------------------------------------------------
-// Generic BlockStore abstraction
-//-----------------------------------------------------------------------------
-
-/**
- * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
- */
-enum BlockStoreAlgorithm
-{
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
-     * directly to memory.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) decreases as the
-     *   access stride between threads increases (i.e., the number items per thread).
-     */
-    BLOCK_STORE_DIRECT,
-
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written directly
-     * to memory using CUDA's built-in vectorized stores as a coalescing optimization.
-     * For example, <tt>st.global.v4.s32</tt> instructions will be generated
-     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high until the the
-     *   access stride between threads (i.e., the number items per thread) exceeds the
-     *   maximum vector store width (typically 4 items or 64B, whichever is lower).
-     * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
-     *   - \p ITEMS_PER_THREAD is odd
-     *   - The \p OutputIteratorT is not a simple pointer type
-     *   - The block output offset is not quadword-aligned
-     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
-     */
-    BLOCK_STORE_VECTORIZE,
-
-    /**
-     * \par Overview
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
-     * transposed and then efficiently written to memory as a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items written per thread.
-     * - The local reordering incurs slightly longer latencies and throughput than the
-     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
-     */
-    BLOCK_STORE_TRANSPOSE,
-
-    /**
-     * \par Overview
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
-     * transposed and then efficiently written to memory as a
-     * [<em>warp-striped arrangement</em>](index.html#sec5sec3)
-     *
-     * \par Usage Considerations
-     * - BLOCK_THREADS must be a multiple of WARP_THREADS
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items written per thread.
-     * - The local reordering incurs slightly longer latencies and throughput than the
-     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
-     */
-    BLOCK_STORE_WARP_TRANSPOSE,
-
-    /**
-     * \par Overview
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
-     * transposed and then efficiently written to memory as a
-     * [<em>warp-striped arrangement</em>](index.html#sec5sec3)
-     * To reduce the shared memory requirement, only one warp's worth of shared
-     * memory is provisioned and is subsequently time-sliced among warps.
-     *
-     * \par Usage Considerations
-     * - BLOCK_THREADS must be a multiple of WARP_THREADS
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items written per thread.
-     * - Provisions less shared memory temporary storage, but incurs larger
-     *   latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
-     */
-    BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
-
-};
-
-
-/**
- * \brief The BlockStore class provides [<em>collective</em>](index.html#sec0) data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory.  ![](block_store_logo.png)
- * \ingroup BlockModule
- * \ingroup UtilIo
- *
- * \tparam T                    The type of data to be written.
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
- * \tparam ALGORITHM            <b>[optional]</b> cub::BlockStoreAlgorithm tuning policy enumeration.  default: cub::BLOCK_STORE_DIRECT.
- * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - The BlockStore class provides a single data movement abstraction that can be specialized
- *   to implement different cub::BlockStoreAlgorithm strategies.  This facilitates different
- *   performance policies for different architectures, data types, granularity sizes, etc.
- * - BlockStore can be optionally specialized by different data movement strategies:
- *   -# <b>cub::BLOCK_STORE_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
- *      directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
- *   -# <b>cub::BLOCK_STORE_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      of data is written directly to memory using CUDA's built-in vectorized stores as a
- *      coalescing optimization.  [More...](\ref cub::BlockStoreAlgorithm)
- *   -# <b>cub::BLOCK_STORE_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec3) which is
- *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
- *   -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec3) which is
- *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockStore}
- * \par
- * The code snippet below illustrates the storing of a "blocked" arrangement
- * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
- * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
- * meaning items are locally reordered among threads so that memory references will be
- * efficiently coalesced using a warp-striped access pattern.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
- *
- *     // Allocate shared memory for BlockStore
- *     __shared__ typename BlockStore::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Store items to linear memory
- *     int thread_data[4];
- *     BlockStore(temp_storage).Store(d_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of \p thread_data across the block of threads is
- * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
- * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
- *
- */
-template <
-    typename                T,
-    int                     BLOCK_DIM_X,
-    int                     ITEMS_PER_THREAD,
-    BlockStoreAlgorithm     ALGORITHM           = BLOCK_STORE_DIRECT,
-    int                     BLOCK_DIM_Y         = 1,
-    int                     BLOCK_DIM_Z         = 1,
-    int                     PTX_ARCH            = CUB_PTX_ARCH>
-class BlockStore
-{
-private:
-    /******************************************************************************
-     * Constants and typed definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-
-    /******************************************************************************
-     * Algorithmic variants
-     ******************************************************************************/
-
-    /// Store helper
-    template <BlockStoreAlgorithm _POLICY, int DUMMY>
-    struct StoreInternal;
-
-
-    /**
-     * BLOCK_STORE_DIRECT specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &/*temp_storage*/,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-    };
-
-
-    /**
-     * BLOCK_STORE_VECTORIZE specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &/*temp_storage*/,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
-        __device__ __forceinline__ void Store(
-            T                   *block_ptr,                 ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
-        }
-
-        /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization)
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT    block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-    };
-
-
-    /**
-     * BLOCK_STORE_TRANSPOSE specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
-    {
-        // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            BlockExchange(temp_storage).BlockedToStriped(items);
-            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT   block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            BlockExchange(temp_storage).BlockedToStriped(items);
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
-        }
-    };
-
-
-    /**
-     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
-    {
-        enum
-        {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
-        };
-
-        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
-        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
-
-        // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
-            T                 (&items)[ITEMS_PER_THREAD])   ///< [in] Data to store
-        {
-            BlockExchange(temp_storage).BlockedToWarpStriped(items);
-            StoreDirectWarpStriped(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
-            T                 (&items)[ITEMS_PER_THREAD],   ///< [in] Data to store
-            int               valid_items)                  ///< [in] Number of valid items to write
-        {
-            BlockExchange(temp_storage).BlockedToWarpStriped(items);
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
-        }
-    };
-
-
-    /**
-     * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
-    {
-        enum
-        {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
-        };
-
-        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
-        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
-
-        // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            BlockExchange(temp_storage).BlockedToWarpStriped(items);
-            StoreDirectWarpStriped(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT   block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            BlockExchange(temp_storage).BlockedToWarpStriped(items);
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
-        }
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Internal load implementation to use
-    typedef StoreInternal<ALGORITHM, 0> InternalStore;
-
-
-    /// Shared memory storage layout type
-    typedef typename InternalStore::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Thread reference to shared storage
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-public:
-
-
-    /// \smemstorage{BlockStore}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockStore()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockStore(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Data movement
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Store items into a linear segment of memory.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the storing of a "blocked" arrangement
-     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
-     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
-     * meaning items are locally reordered among threads so that memory references will be
-     * efficiently coalesced using a warp-striped access pattern.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
-     *
-     *     // Allocate shared memory for BlockStore
-     *     __shared__ typename BlockStore::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Store items to linear memory
-     *     int thread_data[4];
-     *     BlockStore(temp_storage).Store(d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
-     *
-     */
-    template <typename OutputIteratorT>
-    __device__ __forceinline__ void Store(
-        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-    {
-        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
-    }
-
-    /**
-     * \brief Store items into a linear segment of memory, guarded by range.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
-     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
-     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
-     * meaning items are locally reordered among threads so that memory references will be
-     * efficiently coalesced using a warp-striped access pattern.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
-     * {
-     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
-     *
-     *     // Allocate shared memory for BlockStore
-     *     __shared__ typename BlockStore::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Store items to linear memory
-     *     int thread_data[4];
-     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt> and \p valid_items is \p 5.
-     * The output \p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ?, ...</tt>, with
-     * only the first two threads being unmasked to store portions of valid data.
-     *
-     */
-    template <typename OutputIteratorT>
-    __device__ __forceinline__ void Store(
-        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-        T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-        int                 valid_items)                ///< [in] Number of valid items to write
-    {
-        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/specializations/block_histogram_atomic.cuh b/external/cub/cub/block/specializations/block_histogram_atomic.cuh
deleted file mode 100644
index 4599c092568..00000000000
--- a/external/cub/cub/block/specializations/block_histogram_atomic.cuh
+++ /dev/null
@@ -1,82 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-template <int BINS>
-struct BlockHistogramAtomic
-{
-    /// Shared memory storage layout type
-    struct TempStorage {};
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockHistogramAtomic(
-        TempStorage &temp_storage)
-    {}
-
-
-    /// Composite data onto an existing histogram
-    template <
-        typename            T,
-        typename            CounterT,     
-        int                 ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Composite(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        CounterT             histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
-    {
-        // Update histogram
-        #pragma unroll
-        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-        {
-              atomicAdd(histogram + items[i], 1);
-        }
-    }
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/specializations/block_histogram_sort.cuh b/external/cub/cub/block/specializations/block_histogram_sort.cuh
deleted file mode 100644
index b9ad6fb79c5..00000000000
--- a/external/cub/cub/block/specializations/block_histogram_sort.cuh
+++ /dev/null
@@ -1,226 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../block/block_radix_sort.cuh"
-#include "../../block/block_discontinuity.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/**
- * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-template <
-    typename    T,                  ///< Sample type
-    int         BLOCK_DIM_X,        ///< The thread block length in threads along the X dimension
-    int         ITEMS_PER_THREAD,   ///< The number of samples per thread
-    int         BINS,               ///< The number of bins into which histogram samples may fall
-    int         BLOCK_DIM_Y,        ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,        ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>           ///< The PTX compute capability for which to to specialize this collective
-struct BlockHistogramSort
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    // Parameterize BlockRadixSort type for our thread block
-    typedef BlockRadixSort<
-            T,
-            BLOCK_DIM_X,
-            ITEMS_PER_THREAD,
-            NullType,
-            4,
-            (PTX_ARCH >= 350) ? true : false,
-            BLOCK_SCAN_WARP_SCANS,
-            cudaSharedMemBankSizeFourByte,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        BlockRadixSortT;
-
-    // Parameterize BlockDiscontinuity type for our thread block
-    typedef BlockDiscontinuity<
-            T,
-            BLOCK_DIM_X,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        BlockDiscontinuityT;
-
-    /// Shared memory
-    union _TempStorage
-    {
-        // Storage for sorting bin values
-        typename BlockRadixSortT::TempStorage sort;
-
-        struct
-        {
-            // Storage for detecting discontinuities in the tile of sorted bin values
-            typename BlockDiscontinuityT::TempStorage flag;
-
-            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
-            unsigned int run_begin[BINS];
-            unsigned int run_end[BINS];
-        };
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    unsigned int linear_tid;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockHistogramSort(
-        TempStorage     &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    // Discontinuity functor
-    struct DiscontinuityOp
-    {
-        // Reference to temp_storage
-        _TempStorage &temp_storage;
-
-        // Constructor
-        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
-            temp_storage(temp_storage)
-        {}
-
-        // Discontinuity predicate
-        __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index)
-        {
-            if (a != b)
-            {
-                // Note the begin/end offsets in shared storage
-                temp_storage.run_begin[b] = b_index;
-                temp_storage.run_end[a] = b_index;
-
-                return true;
-            }
-            else
-            {
-                return false;
-            }
-        }
-    };
-
-
-    // Composite data onto an existing histogram
-    template <
-        typename            CounterT     >
-    __device__ __forceinline__ void Composite(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        CounterT            histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
-    {
-        enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
-
-        // Sort bytes in blocked arrangement
-        BlockRadixSortT(temp_storage.sort).Sort(items);
-
-        CTA_SYNC();
-
-        // Initialize the shared memory's run_begin and run_end for each bin
-        int histo_offset = 0;
-
-        #pragma unroll
-        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-        {
-            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
-            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
-        }
-        // Finish up with guarded initialization if necessary
-        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
-        {
-            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
-            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
-        }
-
-        CTA_SYNC();
-
-        int flags[ITEMS_PER_THREAD];    // unused
-
-        // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
-        DiscontinuityOp flag_op(temp_storage);
-        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
-
-        // Update begin for first item
-        if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
-
-        CTA_SYNC();
-
-        // Composite into histogram
-        histo_offset = 0;
-
-        #pragma unroll
-        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-        {
-            int thread_offset = histo_offset + linear_tid;
-            CounterT      count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
-            histogram[thread_offset] += count;
-        }
-
-        // Finish up with guarded composition if necessary
-        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
-        {
-            int thread_offset = histo_offset + linear_tid;
-            CounterT      count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
-            histogram[thread_offset] += count;
-        }
-    }
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/specializations/block_reduce_raking.cuh b/external/cub/cub/block/specializations/block_reduce_raking.cuh
deleted file mode 100644
index c2c26651796..00000000000
--- a/external/cub/cub/block/specializations/block_reduce_raking.cuh
+++ /dev/null
@@ -1,222 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
- */
-
-#pragma once
-
-#include "../../block/block_raking_layout.cuh"
-#include "../../warp/warp_reduce.cuh"
-#include "../../thread/thread_reduce.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
- *
- * Supports non-commutative binary reduction operators.  Unlike commutative
- * reduction operators (e.g., addition), the application of a non-commutative
- * reduction operator (e.g, string concatenation) across a sequence of inputs must
- * honor the relative ordering of items and partial reductions when applying the
- * reduction operator.
- *
- * Compared to the implementation of BlockReduceRaking (which does not support
- * non-commutative operators), this implementation requires a few extra
- * rounds of inter-thread communication.
- */
-template <
-    typename    T,              ///< Data type being reduced
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockReduceRaking
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /// Layout type for padded thread block raking grid
-    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
-
-    ///  WarpReduce utility type
-    typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce;
-
-    /// Constants
-    enum
-    {
-        /// Number of raking threads
-        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
-
-        /// Number of raking elements per warp synchronous raking thread
-        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
-
-        /// Cooperative work can be entirely warp synchronous
-        WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
-
-        /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two
-        WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE,
-
-        /// Whether or not accesses into smem are unguarded
-        RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,
-
-    };
-
-
-    /// Shared memory storage layout type
-    union _TempStorage
-    {
-        typename WarpReduce::TempStorage            warp_storage;        ///< Storage for warp-synchronous reduction
-        typename BlockRakingLayout::TempStorage     raking_grid;         ///< Padded thread block raking grid
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    unsigned int linear_tid;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockReduceRaking(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    template <bool IS_FULL_TILE, typename ReductionOp, int ITERATION>
-    __device__ __forceinline__ T RakingReduction(
-        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
-        T                           *raking_segment,
-        T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<ITERATION>         /*iteration*/)
-    {
-        // Update partial if addend is in range
-        if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid))
-        {
-            T addend = raking_segment[ITERATION];
-            partial = reduction_op(partial, addend);
-        }
-        return RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<ITERATION + 1>());
-    }
-
-    template <bool IS_FULL_TILE, typename ReductionOp>
-    __device__ __forceinline__ T RakingReduction(
-        ReductionOp                 /*reduction_op*/,   ///< [in] Binary scan operator
-        T                           * /*raking_segment*/,
-        T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                         /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
-    {
-        return partial;
-    }
-
-
-
-    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <
-        bool                IS_FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
-            partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE, SEGMENT_LENGTH>(
-                partial,
-                num_valid,
-                reduction_op);
-        }
-        else
-        {
-            // Place partial into shared memory grid.
-            *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
-
-            CTA_SYNC();
-
-            // Reduce parallelism to one warp
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking reduction in grid
-                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-                partial = raking_segment[0];
-
-                partial = RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>());
-
-                partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>(
-                    partial,
-                    num_valid,
-                    reduction_op);
-
-            }
-        }
-
-        return partial;
-    }
-
-
-    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <bool IS_FULL_TILE>
-    __device__ __forceinline__ T Sum(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        cub::Sum reduction_op;
-
-        return Reduce<IS_FULL_TILE>(partial, num_valid, reduction_op);
-    }
-
-
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/external/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
deleted file mode 100644
index ee2294607e9..00000000000
--- a/external/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh
+++ /dev/null
@@ -1,199 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.
- */
-
-#pragma once
-
-#include "block_reduce_raking.cuh"
-#include "../../warp/warp_reduce.cuh"
-#include "../../thread/thread_reduce.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.  Does not support block sizes that are not a multiple of the warp size.
- */
-template <
-    typename    T,              ///< Data type being reduced
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockReduceRakingCommutativeOnly
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values
-    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> FallBack;
-
-    /// Constants
-    enum
-    {
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// Whether or not to use fall-back
-        USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),
-
-        /// Number of raking threads
-        RAKING_THREADS = WARP_THREADS,
-
-        /// Number of threads actually sharing items with the raking threads
-        SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
-
-        /// Number of raking elements per warp synchronous raking thread
-        SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
-    };
-
-    ///  WarpReduce utility type
-    typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;
-
-    /// Layout type for padded thread block raking grid
-    typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;
-
-    /// Shared memory storage layout type
-    union _TempStorage
-    {
-        struct
-        {
-            typename WarpReduce::TempStorage        warp_storage;        ///< Storage for warp-synchronous reduction
-            typename BlockRakingLayout::TempStorage raking_grid;         ///< Padded thread block raking grid
-        };
-        typename FallBack::TempStorage              fallback_storage;    ///< Fall-back storage for non-commutative block scan
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    unsigned int linear_tid;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockReduceRakingCommutativeOnly(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <bool FULL_TILE>
-    __device__ __forceinline__ T Sum(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        if (USE_FALLBACK || !FULL_TILE)
-        {
-            return FallBack(temp_storage.fallback_storage).template Sum<FULL_TILE>(partial, num_valid);
-        }
-        else
-        {
-            // Place partial into shared memory grid
-            if (linear_tid >= RAKING_THREADS)
-                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
-
-            CTA_SYNC();
-
-            // Reduce parallelism to one warp
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking reduction in grid
-                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-                partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial);
-
-                // Warpscan
-                partial = WarpReduce(temp_storage.warp_storage).Sum(partial);
-            }
-        }
-
-        return partial;
-    }
-
-
-    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <
-        bool                FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {
-        if (USE_FALLBACK || !FULL_TILE)
-        {
-            return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid, reduction_op);
-        }
-        else
-        {
-            // Place partial into shared memory grid
-            if (linear_tid >= RAKING_THREADS)
-                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
-
-            CTA_SYNC();
-
-            // Reduce parallelism to one warp
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking reduction in grid
-                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-                partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, reduction_op, partial);
-
-                // Warpscan
-                partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op);
-            }
-        }
-
-        return partial;
-    }
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/external/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
deleted file mode 100644
index 68495b4e77e..00000000000
--- a/external/cub/cub/block/specializations/block_reduce_warp_reductions.cuh
+++ /dev/null
@@ -1,222 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
- */
-
-#pragma once
-
-#include "../../warp/warp_reduce.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_arch.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
- */
-template <
-    typename    T,              ///< Data type being reduced
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockReduceWarpReductions
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// Number of active warps
-        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        /// The logical warp size for warp reductions
-        LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
-
-        /// Whether or not the logical warp size evenly divides the thread block size
-        EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
-    };
-
-
-    ///  WarpReduce utility type
-    typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce;
-
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        typename WarpReduce::TempStorage    warp_reduce[WARPS];                ///< Buffer for warp-synchronous scan
-        T                                   warp_aggregates[WARPS];     ///< Shared totals from each warp-synchronous scan
-        T                                   block_prefix;               ///< Shared prefix for the entire thread block
-    };
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    unsigned int linear_tid;
-    unsigned int warp_id;
-    unsigned int lane_id;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockReduceWarpReductions(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId())
-    {}
-
-
-    template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP>
-    __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
-        T                           warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<SUCCESSOR_WARP>    /*successor_warp*/)
-    {
-        if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid))
-        {
-            T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP];
-            warp_aggregate = reduction_op(warp_aggregate, addend);
-        }
-        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<SUCCESSOR_WARP + 1>());
-    }
-
-    template <bool FULL_TILE, typename ReductionOp>
-    __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp         /*reduction_op*/,   ///< [in] Binary scan operator
-        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                 /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<WARPS>     /*successor_warp*/)
-    {
-        return warp_aggregate;
-    }
-
-
-    /// Returns block-wide aggregate in <em>thread</em><sub>0</sub>.
-    template <
-        bool                FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp         reduction_op,       ///< [in] Binary scan operator
-        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        // Share lane aggregates
-        if (lane_id == 0)
-        {
-            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
-        }
-
-        CTA_SYNC();
-
-        // Update total aggregate in warp 0, lane 0
-        if (linear_tid == 0)
-        {
-            warp_aggregate = ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<1>());
-        }
-
-        return warp_aggregate;
-    }
-
-
-    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <bool FULL_TILE>
-    __device__ __forceinline__ T Sum(
-        T                   input,          ///< [in] Calling thread's input partial reductions
-        int                 num_valid)      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        cub::Sum        reduction_op;
-        unsigned int    warp_offset = warp_id * LOGICAL_WARP_SIZE;
-        unsigned int    warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
-                            LOGICAL_WARP_SIZE :
-                            (warp_offset < num_valid) ?
-                                num_valid - warp_offset :
-                                0;
-
-        // Warp reduction in every warp
-        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
-            input,
-            warp_num_valid,
-            cub::Sum());
-
-        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
-        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
-    }
-
-
-    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <
-        bool                FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,              ///< [in] Calling thread's input partial reductions
-        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {
-        unsigned int    warp_offset = warp_id * LOGICAL_WARP_SIZE;
-        unsigned int    warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
-                            LOGICAL_WARP_SIZE :
-                            (warp_offset < static_cast<unsigned int>(num_valid)) ?
-                                num_valid - warp_offset :
-                                0;
-
-        // Warp reduction in every warp
-        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
-            input,
-            warp_num_valid,
-            reduction_op);
-
-        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
-        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/specializations/block_scan_raking.cuh b/external/cub/cub/block/specializations/block_scan_raking.cuh
deleted file mode 100644
index 2e21324c9ee..00000000000
--- a/external/cub/cub/block/specializations/block_scan_raking.cuh
+++ /dev/null
@@ -1,666 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-
-/**
- * \file
- * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_ptx.cuh"
-#include "../../util_arch.cuh"
-#include "../../block/block_raking_layout.cuh"
-#include "../../thread/thread_reduce.cuh"
-#include "../../thread/thread_scan.cuh"
-#include "../../warp/warp_scan.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
- */
-template <
-    typename    T,              ///< Data type being scanned
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    bool        MEMOIZE,        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockScanRaking
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /// Layout type for padded thread block raking grid
-    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
-
-    /// Constants
-    enum
-    {
-        /// Number of raking threads
-        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
-
-        /// Number of raking elements per warp synchronous raking thread
-        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
-
-        /// Cooperative work can be entirely warp synchronous
-        WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
-    };
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        typename WarpScan::TempStorage              warp_scan;          ///< Buffer for warp-synchronous scan
-        typename BlockRakingLayout::TempStorage     raking_grid;        ///< Padded thread block raking grid
-        T                                           block_aggregate;    ///< Block aggregate
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage    &temp_storage;
-    unsigned int    linear_tid;
-    T               cached_segment[SEGMENT_LENGTH];
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    /// Templated reduction
-    template <int ITERATION, typename ScanOp>
-    __device__ __forceinline__ T GuardedReduce(
-        T*                  raking_ptr,         ///< [in] Input array
-        ScanOp              scan_op,            ///< [in] Binary reduction operator
-        T                   raking_partial,     ///< [in] Prefix to seed reduction with
-        Int2Type<ITERATION> /*iteration*/)
-    {
-        if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS))
-        {
-            T addend = raking_ptr[ITERATION];
-            raking_partial = scan_op(raking_partial, addend);
-        }
-
-        return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>());
-    }
-
-
-    /// Templated reduction (base case)
-    template <typename ScanOp>
-    __device__ __forceinline__ T GuardedReduce(
-        T*                          /*raking_ptr*/,    ///< [in] Input array
-        ScanOp                      /*scan_op*/,       ///< [in] Binary reduction operator
-        T                           raking_partial,    ///< [in] Prefix to seed reduction with
-        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
-    {
-        return raking_partial;
-    }
-
-
-    /// Templated copy
-    template <int ITERATION>
-    __device__ __forceinline__ void CopySegment(
-        T*                  out,            ///< [out] Out array
-        T*                  in,             ///< [in] Input array
-        Int2Type<ITERATION> /*iteration*/)
-    {
-        out[ITERATION] = in[ITERATION];
-        CopySegment(out, in, Int2Type<ITERATION + 1>());
-    }
-
- 
-    /// Templated copy (base case)
-    __device__ __forceinline__ void CopySegment(
-        T*                  /*out*/,            ///< [out] Out array
-        T*                  /*in*/,             ///< [in] Input array
-        Int2Type<SEGMENT_LENGTH> /*iteration*/)
-    {}
-
-
-    /// Performs upsweep raking reduction, returning the aggregate
-    template <typename ScanOp>
-    __device__ __forceinline__ T Upsweep(
-        ScanOp scan_op)
-    {
-        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-
-        // Read data into registers
-        CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
-
-        T raking_partial = cached_segment[0];
-
-        return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
-    }
-
-
-    /// Performs exclusive downsweep raking scan
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveDownsweep(
-        ScanOp          scan_op,
-        T               raking_partial,
-        bool            apply_prefix = true)
-    {
-        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-
-        // Read data back into registers
-        if (!MEMOIZE)
-        {
-            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
-        }
-
-        internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
-
-        // Write data back to smem
-        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
-    }
-
-
-    /// Performs inclusive downsweep raking scan
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveDownsweep(
-        ScanOp          scan_op,
-        T               raking_partial,
-        bool            apply_prefix = true)
-    {
-        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-
-        // Read data back into registers
-        if (!MEMOIZE)
-        {
-            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
-        }
-
-        internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
-
-        // Write data back to smem
-        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
-    }
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanRaking(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &exclusive_output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            exclusive_output = *placement_ptr;
-        }
-    }
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Exclusive Warp-synchronous scan
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial);
-            }
-
-            CTA_SYNC();
-
-            // Grab exclusive partial from shared memory
-            output = *placement_ptr;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial= Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T inclusive_partial;
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-
-                // Broadcast aggregate to all threads
-                if (linear_tid == RAKING_THREADS - 1)
-                    temp_storage.block_aggregate = inclusive_partial;
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial);
-
-                // Broadcast aggregate to other threads
-                if (linear_tid == 0)
-                    temp_storage.block_aggregate = block_aggregate;
-            }
-
-            CTA_SYNC();
-
-            // Grab exclusive partial from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            T block_aggregate;
-            WarpScan warp_scan(temp_storage.warp_scan);
-            warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);
-
-            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            block_prefix = warp_scan.Broadcast(block_prefix, 0);
-
-            output = scan_op(block_prefix, output);
-            if (linear_tid == 0)
-                output = block_prefix;
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                WarpScan warp_scan(temp_storage.warp_scan);
-
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T exclusive_partial, block_aggregate;
-                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
-
-                // Obtain block-wide prefix in lane0, then broadcast to other lanes
-                T block_prefix = block_prefix_callback_op(block_aggregate);
-                block_prefix = warp_scan.Broadcast(block_prefix, 0);
-
-                // Update prefix with warpscan exclusive partial
-                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
-                if (linear_tid == 0)
-                    downsweep_prefix = block_prefix;
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, downsweep_prefix);
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Exclusive Warp-synchronous scan
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
-
-                // Inclusive raking downsweep scan
-                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-        }
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T inclusive_partial;
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
-
-                // Inclusive raking downsweep scan
-                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-
-                // Broadcast aggregate to all threads
-                if (linear_tid == RAKING_THREADS - 1)
-                    temp_storage.block_aggregate = inclusive_partial;
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            T block_aggregate;
-            WarpScan warp_scan(temp_storage.warp_scan);
-            warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);
-
-            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            block_prefix = warp_scan.Broadcast(block_prefix, 0);
-
-            // Update prefix with exclusive warpscan partial
-            output = scan_op(block_prefix, output);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                WarpScan warp_scan(temp_storage.warp_scan);
-
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T exclusive_partial, block_aggregate;
-                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
-
-                // Obtain block-wide prefix in lane0, then broadcast to other lanes
-                T block_prefix = block_prefix_callback_op(block_aggregate);
-                block_prefix = warp_scan.Broadcast(block_prefix, 0);
-
-                // Update prefix with warpscan exclusive partial
-                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
-                if (linear_tid == 0)
-                    downsweep_prefix = block_prefix;
-
-                // Inclusive raking downsweep scan
-                InclusiveDownsweep(scan_op, downsweep_prefix);
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/specializations/block_scan_warp_scans.cuh b/external/cub/cub/block/specializations/block_scan_warp_scans.cuh
deleted file mode 100644
index 9252c0a3a7f..00000000000
--- a/external/cub/cub/block/specializations/block_scan_warp_scans.cuh
+++ /dev/null
@@ -1,392 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_arch.cuh"
-#include "../../util_ptx.cuh"
-#include "../../warp/warp_scan.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-template <
-    typename    T,
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockScanWarpScans
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of active warps
-        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-    };
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScan;
-
-    /// Shared memory storage layout type
-
-    struct __align__(32) _TempStorage
-    {
-        T                               warp_aggregates[WARPS];
-        typename WarpScanT::TempStorage warp_scan[WARPS];           ///< Buffer for warp-synchronous scans
-        T                               block_prefix;               ///< Shared prefix for the entire thread block
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage    &temp_storage;
-    unsigned int    linear_tid;
-    unsigned int    warp_id;
-    unsigned int    lane_id;
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanWarpScans(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId())
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    template <typename ScanOp, int WARP>
-    __device__ __forceinline__ void ApplyWarpAggregates(
-        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        Int2Type<WARP>  /*addend_warp*/)
-    {
-        if (warp_id == WARP)
-            warp_prefix = block_aggregate;
-
-        T addend = temp_storage.warp_aggregates[WARP];
-        block_aggregate = scan_op(block_aggregate, addend);
-
-        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
-    }
-
-    template <typename ScanOp>
-    __device__ __forceinline__ void ApplyWarpAggregates(
-        T               &/*warp_prefix*/,       ///< [out] The calling thread's partial reduction
-        ScanOp          /*scan_op*/,            ///< [in] Binary scan operator
-        T               &/*block_aggregate*/,   ///< [out] Threadblock-wide aggregate reduction of input items
-        Int2Type<WARPS> /*addend_warp*/)
-    {}
-
-
-    /// Use the warp-wide aggregates to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
-    template <typename ScanOp>
-    __device__ __forceinline__ T ComputeWarpPrefix(
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
-
-        CTA_SYNC();
-
-        // Accumulate block aggregates and save the one that is our warp's prefix
-        T warp_prefix;
-        block_aggregate = temp_storage.warp_aggregates[0];
-
-        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
-        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
-/*
-        #pragma unroll
-        for (int WARP = 1; WARP < WARPS; ++WARP)
-        {
-            if (warp_id == WARP)
-                warp_prefix = block_aggregate;
-
-            T addend = temp_storage.warp_aggregates[WARP];
-            block_aggregate = scan_op(block_aggregate, addend);
-        }
-*/
-
-        return warp_prefix;
-    }
-
-
-    /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
-    template <typename ScanOp>
-    __device__ __forceinline__ T ComputeWarpPrefix(
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        const T         &initial_value)     ///< [in] Initial value to seed the exclusive scan
-    {
-        T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
-
-        warp_prefix = scan_op(initial_value, warp_prefix);
-
-        if (warp_id == 0)
-            warp_prefix = initial_value;
-
-        return warp_prefix;
-    }
-
-    //---------------------------------------------------------------------
-    // Exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item
-        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
-        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
-
-        // Apply warp prefix to our lane's partial
-        if (warp_id != 0)
-        {
-            exclusive_output = scan_op(warp_prefix, exclusive_output);
-            if (lane_id == 0)
-                exclusive_output = warp_prefix;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp
-        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
-
-        // Apply warp prefix to our lane's partial
-        exclusive_output = scan_op(warp_prefix, exclusive_output);
-        if (lane_id == 0)
-            exclusive_output = warp_prefix;
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-
-        // Use the first warp to determine the thread block prefix, returning the result in lane0
-        if (warp_id == 0)
-        {
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            if (lane_id == 0)
-            {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-                exclusive_output = block_prefix;                // The block prefix is the exclusive output for tid0
-            }
-        }
-
-        CTA_SYNC();
-
-        // Incorporate thread block prefix into outputs
-        T block_prefix = temp_storage.block_prefix;
-        if (linear_tid > 0)
-        {
-            exclusive_output = scan_op(block_prefix, exclusive_output);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
-        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
-
-        // Apply warp prefix to our lane's partial
-        if (warp_id != 0)
-        {
-            inclusive_output = scan_op(warp_prefix, inclusive_output);
-        }
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        T block_aggregate;
-        InclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-
-        // Use the first warp to determine the thread block prefix, returning the result in lane0
-        if (warp_id == 0)
-        {
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            if (lane_id == 0)
-            {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-            }
-        }
-
-        CTA_SYNC();
-
-        // Incorporate thread block prefix into outputs
-        T block_prefix = temp_storage.block_prefix;
-        exclusive_output = scan_op(block_prefix, exclusive_output);
-    }
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/specializations/block_scan_warp_scans2.cuh b/external/cub/cub/block/specializations/block_scan_warp_scans2.cuh
deleted file mode 100644
index eb0a3a1b54e..00000000000
--- a/external/cub/cub/block/specializations/block_scan_warp_scans2.cuh
+++ /dev/null
@@ -1,436 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_arch.cuh"
-#include "../../util_ptx.cuh"
-#include "../../warp/warp_scan.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-template <
-    typename    T,
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockScanWarpScans
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of active warps
-        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-    };
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScanT;
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        typename WarpAggregateScanT::TempStorage    inner_scan[WARPS];          ///< Buffer for warp-synchronous scans
-        typename WarpScanT::TempStorage             warp_scan[WARPS];           ///< Buffer for warp-synchronous scans
-        T                                           warp_aggregates[WARPS];
-        T                                           block_prefix;               ///< Shared prefix for the entire thread block
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage    &temp_storage;
-    unsigned int    linear_tid;
-    unsigned int    warp_id;
-    unsigned int    lane_id;
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanWarpScans(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId())
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    template <typename ScanOp, int WARP>
-    __device__ __forceinline__ void ApplyWarpAggregates(
-        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        Int2Type<WARP>  addend_warp)
-    {
-        if (warp_id == WARP)
-            warp_prefix = block_aggregate;
-
-        T addend = temp_storage.warp_aggregates[WARP];
-        block_aggregate = scan_op(block_aggregate, addend);
-
-        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
-    }
-
-    template <typename ScanOp>
-    __device__ __forceinline__ void ApplyWarpAggregates(
-        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        Int2Type<WARPS> addend_warp)
-    {}
-
-
-    /// Use the warp-wide aggregates to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
-    template <typename ScanOp>
-    __device__ __forceinline__ T ComputeWarpPrefix(
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
-
-        CTA_SYNC();
-
-        // Accumulate block aggregates and save the one that is our warp's prefix
-        T warp_prefix;
-        block_aggregate = temp_storage.warp_aggregates[0];
-
-        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
-        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
-/*
-        #pragma unroll
-        for (int WARP = 1; WARP < WARPS; ++WARP)
-        {
-            if (warp_id == WARP)
-                warp_prefix = block_aggregate;
-
-            T addend = temp_storage.warp_aggregates[WARP];
-            block_aggregate = scan_op(block_aggregate, addend);
-        }
-*/
-
-        return warp_prefix;
-    }
-
-
-    /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
-    template <typename ScanOp>
-    __device__ __forceinline__ T ComputeWarpPrefix(
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        const T         &initial_value)     ///< [in] Initial value to seed the exclusive scan
-    {
-        T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
-
-        warp_prefix = scan_op(initial_value, warp_prefix);
-
-        if (warp_id == 0)
-            warp_prefix = initial_value;
-
-        return warp_prefix;
-    }
-
-    //---------------------------------------------------------------------
-    // Exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item
-        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
-
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
-//        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
-
-//--------------------------------------------------
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        // Get the warp scan partial
-        T warp_inclusive, warp_prefix;
-        if (lane_id < WARPS)
-        {
-            // Scan the warpscan partials
-            T warp_val = temp_storage.warp_aggregates[lane_id];
-            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op);
-        }
-
-        warp_prefix         = my_warp_scan.Broadcast(warp_prefix, warp_id);
-        block_aggregate     = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
-//--------------------------------------------------
-
-        // Apply warp prefix to our lane's partial
-        if (warp_id != 0)
-        {
-            exclusive_output = scan_op(warp_prefix, exclusive_output);
-            if (lane_id == 0)
-                exclusive_output = warp_prefix;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
-
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp
-//        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
-
-//--------------------------------------------------
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        // Get the warp scan partial
-        T warp_inclusive, warp_prefix;
-        if (lane_id < WARPS)
-        {
-            // Scan the warpscan partials
-            T warp_val = temp_storage.warp_aggregates[lane_id];
-            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op);
-        }
-
-        warp_prefix         = my_warp_scan.Broadcast(warp_prefix, warp_id);
-        block_aggregate     = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
-//--------------------------------------------------
-
-        // Apply warp prefix to our lane's partial
-        exclusive_output = scan_op(warp_prefix, exclusive_output);
-        if (lane_id == 0)
-            exclusive_output = warp_prefix;
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-
-        // Use the first warp to determine the thread block prefix, returning the result in lane0
-        if (warp_id == 0)
-        {
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            if (lane_id == 0)
-            {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-                exclusive_output = block_prefix;                // The block prefix is the exclusive output for tid0
-            }
-        }
-
-        CTA_SYNC();
-
-        // Incorporate thread block prefix into outputs
-        T block_prefix = temp_storage.block_prefix;
-        if (linear_tid > 0)
-        {
-            exclusive_output = scan_op(block_prefix, exclusive_output);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
-        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
-
-        // Apply warp prefix to our lane's partial
-        if (warp_id != 0)
-        {
-            inclusive_output = scan_op(warp_prefix, inclusive_output);
-        }
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        T block_aggregate;
-        InclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-
-        // Use the first warp to determine the thread block prefix, returning the result in lane0
-        if (warp_id == 0)
-        {
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            if (lane_id == 0)
-            {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-            }
-        }
-
-        CTA_SYNC();
-
-        // Incorporate thread block prefix into outputs
-        T block_prefix = temp_storage.block_prefix;
-        exclusive_output = scan_op(block_prefix, exclusive_output);
-    }
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/block/specializations/block_scan_warp_scans3.cuh b/external/cub/cub/block/specializations/block_scan_warp_scans3.cuh
deleted file mode 100644
index 18bd585823a..00000000000
--- a/external/cub/cub/block/specializations/block_scan_warp_scans3.cuh
+++ /dev/null
@@ -1,418 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_arch.cuh"
-#include "../../util_ptx.cuh"
-#include "../../warp/warp_scan.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-template <
-    typename    T,
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockScanWarpScans
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of warp threads
-        INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-        OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS,
-
-        /// Number of outer scan warps
-        OUTER_WARPS = INNER_WARP_THREADS
-    };
-
-    ///  Outer WarpScan utility type
-    typedef WarpScan<T, OUTER_WARP_THREADS, PTX_ARCH> OuterWarpScanT;
-
-    ///  Inner WarpScan utility type
-    typedef WarpScan<T, INNER_WARP_THREADS, PTX_ARCH> InnerWarpScanT;
-
-    typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS];
-
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        union Aliasable
-        {
-            Uninitialized<OuterScanArray>           outer_warp_scan;  ///< Buffer for warp-synchronous outer scans
-            typename InnerWarpScanT::TempStorage    inner_warp_scan;  ///< Buffer for warp-synchronous inner scan
-
-        } aliasable;
-
-        T                               warp_aggregates[OUTER_WARPS];
-
-        T                               block_aggregate;                           ///< Shared prefix for the entire thread block
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage    &temp_storage;
-    unsigned int    linear_tid;
-    unsigned int    warp_id;
-    unsigned int    lane_id;
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanWarpScans(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS),
-        lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item
-        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
-            input, inclusive_output, exclusive_output, scan_op);
-
-        // Share outer warp total
-        if (lane_id == OUTER_WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        if (linear_tid < INNER_WARP_THREADS)
-        {
-            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
-            T outer_warp_exclusive;
-
-            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
-                outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
-
-            temp_storage.block_aggregate                = block_aggregate;
-            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
-        }
-
-        CTA_SYNC();
-
-        if (warp_id != 0)
-        {
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-
-            // Apply warp prefix to our lane's partial
-            T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
-            exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
-            if (lane_id == 0)
-                exclusive_output = outer_warp_exclusive;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
-            input, inclusive_output, exclusive_output, scan_op);
-
-        // Share outer warp total
-        if (lane_id == OUTER_WARP_THREADS - 1)
-        {
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-        }
-
-        CTA_SYNC();
-
-        if (linear_tid < INNER_WARP_THREADS)
-        {
-            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
-            T outer_warp_exclusive;
-
-            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
-                outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate);
-
-            temp_storage.block_aggregate                = block_aggregate;
-            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
-        }
-
-        CTA_SYNC();
-
-        // Retrieve block aggregate
-        block_aggregate = temp_storage.block_aggregate;
-
-        // Apply warp prefix to our lane's partial
-        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
-        exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
-        if (lane_id == 0)
-            exclusive_output = outer_warp_exclusive;
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
-            input, inclusive_output, exclusive_output, scan_op);
-
-        // Share outer warp total
-        if (lane_id == OUTER_WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        if (linear_tid < INNER_WARP_THREADS)
-        {
-            InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
-
-            T upsweep = temp_storage.warp_aggregates[linear_tid];
-            T downsweep_prefix, block_aggregate;
-
-            inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
-
-            // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            block_prefix = inner_scan.Broadcast(block_prefix, 0);
-
-            downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
-            if (linear_tid == 0)
-                downsweep_prefix = block_prefix;
-
-            temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;
-        }
-
-        CTA_SYNC();
-
-        // Apply warp prefix to our lane's partial (or assign it if partial is invalid)
-        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
-        exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
-        if (lane_id == 0)
-            exclusive_output = outer_warp_exclusive;
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
-            input, inclusive_output, scan_op);
-
-        // Share outer warp total
-        if (lane_id == OUTER_WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        if (linear_tid < INNER_WARP_THREADS)
-        {
-            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
-            T outer_warp_exclusive;
-
-            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
-                outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
-
-            temp_storage.block_aggregate                = block_aggregate;
-            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
-        }
-
-        CTA_SYNC();
-
-        if (warp_id != 0)
-        {
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-
-            // Apply warp prefix to our lane's partial
-            T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
-            inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
-        }
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
-            input, inclusive_output, scan_op);
-
-        // Share outer warp total
-        if (lane_id == OUTER_WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        if (linear_tid < INNER_WARP_THREADS)
-        {
-            InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
-
-            T upsweep = temp_storage.warp_aggregates[linear_tid];
-            T downsweep_prefix, block_aggregate;
-            inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
-
-            // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            block_prefix = inner_scan.Broadcast(block_prefix, 0);
-
-            downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
-            if (linear_tid == 0)
-                downsweep_prefix = block_prefix;
-
-            temp_storage.warp_aggregates[linear_tid]    = downsweep_prefix;
-        }
-
-        CTA_SYNC();
-
-        // Apply warp prefix to our lane's partial
-        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
-        inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
-    }
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/cub.cuh b/external/cub/cub/cub.cuh
deleted file mode 100644
index b1c8e3200ab..00000000000
--- a/external/cub/cub/cub.cuh
+++ /dev/null
@@ -1,95 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * CUB umbrella include file
- */
-
-#pragma once
-
-
-// Block
-#include "block/block_histogram.cuh"
-#include "block/block_discontinuity.cuh"
-#include "block/block_exchange.cuh"
-#include "block/block_load.cuh"
-#include "block/block_radix_rank.cuh"
-#include "block/block_radix_sort.cuh"
-#include "block/block_reduce.cuh"
-#include "block/block_scan.cuh"
-#include "block/block_store.cuh"
-//#include "block/block_shift.cuh"
-
-// Device
-#include "device/device_histogram.cuh"
-#include "device/device_partition.cuh"
-#include "device/device_radix_sort.cuh"
-#include "device/device_reduce.cuh"
-#include "device/device_run_length_encode.cuh"
-#include "device/device_scan.cuh"
-#include "device/device_segmented_radix_sort.cuh"
-#include "device/device_segmented_reduce.cuh"
-#include "device/device_select.cuh"
-#include "device/device_spmv.cuh"
-
-// Grid
-//#include "grid/grid_barrier.cuh"
-#include "grid/grid_even_share.cuh"
-#include "grid/grid_mapping.cuh"
-#include "grid/grid_queue.cuh"
-
-// Thread
-#include "thread/thread_load.cuh"
-#include "thread/thread_operators.cuh"
-#include "thread/thread_reduce.cuh"
-#include "thread/thread_scan.cuh"
-#include "thread/thread_store.cuh"
-
-// Warp
-#include "warp/warp_reduce.cuh"
-#include "warp/warp_scan.cuh"
-
-// Iterator
-#include "iterator/arg_index_input_iterator.cuh"
-#include "iterator/cache_modified_input_iterator.cuh"
-#include "iterator/cache_modified_output_iterator.cuh"
-#include "iterator/constant_input_iterator.cuh"
-#include "iterator/counting_input_iterator.cuh"
-#include "iterator/tex_obj_input_iterator.cuh"
-#include "iterator/tex_ref_input_iterator.cuh"
-#include "iterator/transform_input_iterator.cuh"
-
-// Util
-#include "util_arch.cuh"
-#include "util_debug.cuh"
-#include "util_device.cuh"
-#include "util_macro.cuh"
-#include "util_ptx.cuh"
-#include "util_type.cuh"
-
diff --git a/external/cub/cub/device/device_histogram.cuh b/external/cub/cub/device/device_histogram.cuh
deleted file mode 100644
index db131eee764..00000000000
--- a/external/cub/cub/device/device_histogram.cuh
+++ /dev/null
@@ -1,866 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-#include <limits>
-
-#include "dispatch/dispatch_histogram.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
- * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
- *
- * \par Usage Considerations
- * \cdp_class{DeviceHistogram}
- *
- */
-struct DeviceHistogram
-{
-    /******************************************************************//**
-     * \name Evenly-segmented bin ranges
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
-     *
-     * \par
-     * - The number of histogram bins is (\p num_levels - 1)
-     * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of a six-bin histogram
-     * from a sequence of float samples
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples and
-     * // output histogram
-     * int      num_samples;    // e.g., 10
-     * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
-     * int*     d_histogram;    // e.g., [ -, -, -, -, -, -, -, -]
-     * int      num_levels;     // e.g., 7       (seven level boundaries for six bins)
-     * float    lower_level;    // e.g., 0.0     (lower sample value boundary of lowest bin)
-     * float    upper_level;    // e.g., 12.0    (upper sample value boundary of upper bin)
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
-     *
-     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
-     *
-     * \endcode
-     *
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t HistogramEven(
-        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
-        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
-        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
-        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
-        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
-        OffsetT             num_samples,                                ///< [in] The number of input samples (i.e., the length of \p d_samples)
-        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-        CounterT*           d_histogram1[1]     = {d_histogram};
-        int                 num_levels1[1]      = {num_levels};
-        LevelT              lower_level1[1]     = {lower_level};
-        LevelT              upper_level1[1]     = {upper_level};
-
-        return MultiHistogramEven<1, 1>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram1,
-            num_levels1,
-            lower_level1,
-            upper_level1,
-            num_samples,
-            1,
-            sizeof(SampleT) * num_samples,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
-     *
-     * \par
-     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
-     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
-     * - The row stride must be a whole multiple of the sample data type
-     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
-     * - The number of histogram bins is (\p num_levels - 1)
-     * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of a six-bin histogram
-     * from a 2x5 region of interest within a flattened 2x7 array of float samples.
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples and
-     * // output histogram
-     * int      num_row_samples;    // e.g., 5
-     * int      num_rows;           // e.g., 2;
-     * size_t   row_stride_bytes;   // e.g., 7 * sizeof(float)
-     * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
-     *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
-     * int*     d_histogram;        // e.g., [ -, -, -, -, -, -, -, -]
-     * int      num_levels;         // e.g., 7       (seven level boundaries for six bins)
-     * float    lower_level;        // e.g., 0.0     (lower sample value boundary of lowest bin)
-     * float    upper_level;        // e.g., 12.0    (upper sample value boundary of upper bin)
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage  = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
-     *     num_row_samples, num_rows, row_stride_bytes);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
-     *     num_row_samples, num_rows, row_stride_bytes);
-     *
-     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
-     *
-     * \endcode
-     *
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t HistogramEven(
-        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
-        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
-        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
-        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
-        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
-        OffsetT             num_row_samples,                            ///< [in] The number of data samples per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        CounterT*           d_histogram1[1]     = {d_histogram};
-        int                 num_levels1[1]      = {num_levels};
-        LevelT              lower_level1[1]     = {lower_level};
-        LevelT              upper_level1[1]     = {upper_level};
-
-        return MultiHistogramEven<1, 1>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram1,
-            num_levels1,
-            lower_level1,
-            upper_level1,
-            num_row_samples,
-            num_rows,
-            row_stride_bytes,
-            stream,
-            debug_synchronous);
-    }
-
-    /**
-     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
-     *
-     * \par
-     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
-     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
-     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
-     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
-     *   pixel samples).
-     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
-     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms
-     * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples
-     * // and output histograms
-     * int              num_pixels;         // e.g., 5
-     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
-     *                                      //        (0, 6, 7, 5), (3, 0, 2, 6)]
-     * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
-     *                                      //       each allocated with 256 integer counters
-     * int              num_levels[3];      // e.g., {257, 257, 257};
-     * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
-     * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
-     *
-     * // d_histogram   <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
-     * //                     [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
-     * //                     [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
-     *
-     * \endcode
-     *
-     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        int                 NUM_CHANNELS,
-        int                 NUM_ACTIVE_CHANNELS,
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t MultiHistogramEven(
-        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
-        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-        OffsetT             num_pixels,                                 ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
-        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-        return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram,
-            num_levels,
-            lower_level,
-            upper_level,
-            num_pixels,
-            1,
-            sizeof(SampleT) * NUM_CHANNELS * num_pixels,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
-     *
-     * \par
-     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
-     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
-     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
-     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
-     *   pixel samples).
-     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
-     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
-     * - The row stride must be a whole multiple of the sample data type
-     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
-     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
-     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms from a 2x3 region of
-     * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples
-     * // and output histograms
-     * int              num_row_pixels;     // e.g., 3
-     * int              num_rows;           // e.g., 2
-     * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
-     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -),
-     *                                      //        (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)]
-     * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
-     *                                      //       each allocated with 256 integer counters
-     * int              num_levels[3];      // e.g., {257, 257, 257};
-     * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
-     * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
-     *     num_row_pixels, num_rows, row_stride_bytes);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
-     *     num_row_pixels, num_rows, row_stride_bytes);
-     *
-     * // d_histogram   <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
-     * //                     [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
-     * //                     [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
-     *
-     * \endcode
-     *
-     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        int                 NUM_CHANNELS,
-        int                 NUM_ACTIVE_CHANNELS,
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t MultiHistogramEven(
-        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
-        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
-
-        if ((sizeof(OffsetT) > sizeof(int)) &&
-            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
-        {
-            // Down-convert OffsetT data type
-
-
-            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
-                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
-                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
-                stream, debug_synchronous, is_byte_sample);
-        }
-
-        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
-            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
-            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
-            stream, debug_synchronous, is_byte_sample);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Custom bin ranges
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
-     *
-     * \par
-     * - The number of histogram bins is (\p num_levels - 1)
-     * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of an six-bin histogram
-     * from a sequence of float samples
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples and
-     * // output histogram
-     * int      num_samples;    // e.g., 10
-     * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
-     * int*     d_histogram;    // e.g., [ -, -, -, -, -, -, -, -]
-     * int      num_levels      // e.g., 7 (seven level boundaries for six bins)
-     * float*   d_levels;       // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
-     *
-     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
-     *
-     * \endcode
-     *
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t HistogramRange(
-        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
-        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
-        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
-        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_samples,                            ///< [in] The number of data samples per row in the region of interest
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-        CounterT*           d_histogram1[1] = {d_histogram};
-        int                 num_levels1[1]  = {num_levels};
-        LevelT*             d_levels1[1]    = {d_levels};
-
-        return MultiHistogramRange<1, 1>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram1,
-            num_levels1,
-            d_levels1,
-            num_samples,
-            1,
-            sizeof(SampleT) * num_samples,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
-     *
-     * \par
-     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
-     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
-     * - The row stride must be a whole multiple of the sample data type
-     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
-     * - The number of histogram bins is (\p num_levels - 1)
-     * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of a six-bin histogram
-     * from a 2x5 region of interest within a flattened 2x7 array of float samples.
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples and
-     * // output histogram
-     * int      num_row_samples;    // e.g., 5
-     * int      num_rows;           // e.g., 2;
-     * int      row_stride_bytes;   // e.g., 7 * sizeof(float)
-     * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
-     *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
-     * int*     d_histogram;        // e.g., [ , , , , , , , ]
-     * int      num_levels          // e.g., 7 (seven level boundaries for six bins)
-     * float    *d_levels;          // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels,
-     *     num_row_samples, num_rows, row_stride_bytes);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels,
-     *     num_row_samples, num_rows, row_stride_bytes);
-     *
-     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
-     *
-     * \endcode
-     *
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t HistogramRange(
-        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
-        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
-        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
-        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_row_samples,                        ///< [in] The number of data samples per row in the region of interest
-        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
-        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        CounterT*           d_histogram1[1]     = {d_histogram};
-        int                 num_levels1[1]      = {num_levels};
-        LevelT*             d_levels1[1]        = {d_levels};
-
-        return MultiHistogramRange<1, 1>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram1,
-            num_levels1,
-            d_levels1,
-            num_row_samples,
-            num_rows,
-            row_stride_bytes,
-            stream,
-            debug_synchronous);
-    }
-
-    /**
-     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
-     *
-     * \par
-     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
-     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
-     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
-     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
-     *   pixel samples).
-     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
-     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms
-     * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples
-     * // and output histograms
-     * int            num_pixels;       // e.g., 5
-     * unsigned char  *d_samples;       // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2),
-     *                                  //        (0, 6, 7, 5),(3, 0, 2, 6)]
-     * unsigned int   *d_histogram[3];  // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
-     * int            num_levels[3];    // e.g., {5, 5, 5};
-     * unsigned int   *d_levels[3];     // e.g., [ [0, 2, 4, 6, 8],
-     *                                  //         [0, 2, 4, 6, 8],
-     *                                  //         [0, 2, 4, 6, 8] ];
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
-     *
-     * // d_histogram   <-- [ [1, 3, 0, 1],
-     * //                     [3, 0, 0, 2],
-     * //                     [0, 2, 0, 3] ]
-     *
-     * \endcode
-     *
-     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        int                 NUM_CHANNELS,
-        int                 NUM_ACTIVE_CHANNELS,
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t MultiHistogramRange(
-        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
-        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_pixels,                             ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-        return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram,
-            num_levels,
-            d_levels,
-            num_pixels,
-            1,
-            sizeof(SampleT) * NUM_CHANNELS * num_pixels,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
-     *
-     * \par
-     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
-     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
-     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
-     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
-     *   pixel samples).
-     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
-     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
-     * - The row stride must be a whole multiple of the sample data type
-     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
-     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
-     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms from a 2x3 region of
-     * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples
-     * // and output histograms
-     * int              num_row_pixels;     // e.g., 3
-     * int              num_rows;           // e.g., 2
-     * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
-     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -),
-     *                                      //        (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)]
-     * int*             d_histogram[3];     // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
-     * int              num_levels[3];      // e.g., {5, 5, 5};
-     * unsigned int*    d_levels[3];        // e.g., [ [0, 2, 4, 6, 8],
-     *                                      //         [0, 2, 4, 6, 8],
-     *                                      //         [0, 2, 4, 6, 8] ];
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
-     *
-     * // d_histogram   <-- [ [2, 3, 0, 1],
-     * //                     [3, 0, 0, 2],
-     * //                     [1, 2, 0, 3] ]
-     *
-     * \endcode
-     *
-     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        int                 NUM_CHANNELS,
-        int                 NUM_ACTIVE_CHANNELS,
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t MultiHistogramRange(
-        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
-        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
-        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
-
-        if ((sizeof(OffsetT) > sizeof(int)) &&
-            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
-        {
-            // Down-convert OffsetT data type
-            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
-                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
-                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
-                stream, debug_synchronous, is_byte_sample);
-        }
-
-        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
-            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
-            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
-            stream, debug_synchronous, is_byte_sample);
-    }
-
-
-
-    //@}  end member group
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/device_partition.cuh b/external/cub/cub/device/device_partition.cuh
deleted file mode 100644
index 154506edcc0..00000000000
--- a/external/cub/cub/device/device_partition.cuh
+++ /dev/null
@@ -1,273 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_select_if.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. ![](partition_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from
- * a specified input sequence.
- *
- * \par Usage Considerations
- * \cdp_class{DevicePartition}
- *
- * \par Performance
- * \linear_performance{partition}
- *
- * \par
- * The following chart illustrates DevicePartition::If
- * performance across different CUDA architectures for \p int32 items,
- * where 50% of the items are randomly selected for the first partition.
- * \plots_below
- *
- * \image html partition_if_int32_50_percent.png
- *
- */
-struct DevicePartition
-{
-    /**
-     * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png)
-     *
-     * \par
-     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
-     * - Copies of the selected items are compacted into \p d_out and maintain their original
-     *   relative ordering, however copies of the unselected items are compacted into the
-     *   rear of \p d_out in reverse order.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_partition.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
-     * int  num_items;              // e.g., 8
-     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
-     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
-     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int  *d_num_selected_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // d_out                 <-- [1, 4, 6, 7, 8, 5, 3, 2]
-     * // d_num_selected_out    <-- [4]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
-     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    FlagIterator,
-        typename                    OutputIteratorT,
-        typename                    NumSelectedIteratorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Flagged(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
-        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
-        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
-        int                         num_items,                      ///< [in] Total number of items to select from
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     OffsetT;         // Signed integer type for global offsets
-        typedef NullType                SelectOp;       // Selection op (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_flags,
-            d_out,
-            d_num_selected_out,
-            SelectOp(),
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png)
-     *
-     * \par
-     * - Copies of the selected items are compacted into \p d_out and maintain their original
-     *   relative ordering, however copies of the unselected items are compacted into the
-     *   rear of \p d_out in reverse order.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated partition-if performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
-     * selected for the first partition with 50% probability.
-     *
-     * \image html partition_if_int32_50_percent.png
-     * \image html partition_if_int64_50_percent.png
-     *
-     * \par
-     * The following charts are similar, but 5% selection probability for the first partition:
-     *
-     * \image html partition_if_int32_5_percent.png
-     * \image html partition_if_int64_5_percent.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
-     *
-     * // Functor type for selecting values less than some criteria
-     * struct LessThan
-     * {
-     *     int compare;
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     LessThan(int compare) : compare(compare) {}
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     bool operator()(const int &a) const {
-     *         return (a < compare);
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int      num_items;              // e.g., 8
-     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
-     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int      *d_num_selected_out;    // e.g., [ ]
-     * LessThan select_op(7);
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // d_out                 <-- [0, 2, 3, 5, 2, 8, 81, 9]
-     * // d_num_selected_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
-     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     * \tparam SelectOp             <b>[inferred]</b> Selection functor type having member <tt>bool operator()(const T &a)</tt>
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT,
-        typename                    NumSelectedIteratorT,
-        typename                    SelectOp>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t If(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
-        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
-        int                         num_items,                      ///< [in] Total number of items to select from
-        SelectOp                    select_op,                      ///< [in] Unary selection operator
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     OffsetT;         // Signed integer type for global offsets
-        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            NULL,
-            d_out,
-            d_num_selected_out,
-            select_op,
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-/**
- * \example example_device_partition_flagged.cu
- * \example example_device_partition_if.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/device_radix_sort.cuh b/external/cub/cub/device/device_radix_sort.cuh
deleted file mode 100644
index fe6cad65d7b..00000000000
--- a/external/cub/cub/device/device_radix_sort.cuh
+++ /dev/null
@@ -1,796 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_radix_sort.cuh"
-#include "../util_arch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
- * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
- * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
- * characters, etc.) specified from least-significant to most-significant.  For a
- * given input sequence of keys and a set of rules specifying a total ordering
- * of the symbolic alphabet, the radix sorting method produces a lexicographic
- * ordering of those keys.
- *
- * \par
- * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
- * <tt>unsigned char</tt>, \p int, \p double, etc.  Although the direct radix sorting
- * method can only be applied to unsigned integral types, DeviceRadixSort
- * is able to sort signed and floating-point types via simple bit-wise transformations
- * that ensure lexicographic key ordering.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceRadixSort}
- *
- * \par Performance
- * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys
- * performance across different CUDA architectures for uniform-random \p uint32 keys.
- * \plots_below
- *
- * \image html lsb_radix_sort_int32_keys.png
- *
- */
-struct DeviceRadixSort
-{
-
-    /******************************************************************//**
-     * \name KeyT-value pairs
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Sorts key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
-     * <tt>uint64,uint64</tt> pairs, respectively.
-     *
-     * \image html lsb_radix_sort_int32_pairs.png
-     * \image html lsb_radix_sort_int64_pairs.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [        ...        ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
-     * // d_values_out          <-- [5, 4, 3, 1, 2, 0, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
-     * <tt>uint64,uint64</tt> pairs, respectively.
-     *
-     * \image html lsb_radix_sort_int32_pairs.png
-     * \image html lsb_radix_sort_int64_pairs.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
-     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,                              ///< [in] Number of items to sort
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortPairs.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [        ...        ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]
-     * // d_values_out          <-- [6, 0, 2, 1, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortPairs.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
-     * // d_values.Current()    <-- [6, 0, 2, 1, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,                              ///< [in] Number of items to sort
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Keys-only
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sorts keys into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
-     *
-     * \image html lsb_radix_sort_int32_keys.png
-     * \image html lsb_radix_sort_int64_keys.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     */
-    template <typename KeyT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<NullType>  d_values;
-
-        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts keys into ascending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
-     *
-     * \image html lsb_radix_sort_int32_keys.png
-     * \image html lsb_radix_sort_int64_keys.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     */
-    template <typename KeyT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-    /**
-     * \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortKeys.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]s
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     */
-    template <typename KeyT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<NullType>  d_values;
-
-        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts keys into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortKeys.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     */
-    template <typename KeyT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-
-
-};
-
-/**
- * \example example_device_radix_sort.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/device_reduce.cuh b/external/cub/cub/device/device_reduce.cuh
deleted file mode 100644
index 3939a7ee7bf..00000000000
--- a/external/cub/cub/device/device_reduce.cuh
+++ /dev/null
@@ -1,734 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-#include <limits>
-
-#include "../iterator/arg_index_input_iterator.cuh"
-#include "dispatch/dispatch_reduce.cuh"
-#include "dispatch/dispatch_reduce_by_key.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceReduce}
- *
- * \par Performance
- * \linear_performance{reduction, reduce-by-key, and run-length encode}
- *
- * \par
- * The following chart illustrates DeviceReduce::Sum
- * performance across different CUDA architectures for \p int32 keys.
- *
- * \image html reduce_int32.png
- *
- * \par
- * The following chart illustrates DeviceReduce::ReduceByKey (summation)
- * performance across different CUDA architectures for \p fp32
- * values.  Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
- *
- * \image html reduce_by_key_fp32_len_500.png
- *
- * \par
- * \plots_below
- *
- */
-struct DeviceReduce
-{
-    /**
-     * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init.
-     *
-     * \par
-     * - Does not support binary reduction operators that are non-commutative.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     __device__ __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;  // e.g., 7
-     * int          *d_in;      // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;     // e.g., [-]
-     * CustomMin    min_op;
-     * int          init;       // e.g., INT_MAX
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run reduction
-     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
-     *
-     * // d_out <-- [0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     * \tparam ReductionOpT         <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
-     * \tparam T                    <b>[inferred]</b> Data element type that is convertible to the \p value type of \p InputIteratorT
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT,
-        typename                    ReductionOpT,
-        typename                    T>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Reduce(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        ReductionOpT                reduction_op,                       ///< [in] Binary reduction functor
-        T                           init,                               ///< [in] Initial value of the reduction
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            reduction_op,
-            init,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide sum using the addition (\p +) operator.
-     *
-     * \par
-     * - Uses \p 0 as the initial value of the reduction.
-     * - Does not support \p + operators that are non-commutative..
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sum-reduction performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.
-     *
-     * \image html reduce_int32.png
-     * \image html reduce_int64.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [-]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sum-reduction
-     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // d_out <-- [38]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Sum(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The output value type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            cub::Sum(),
-            OutputT(),            // zero-initialize
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide minimum using the less-than ('<') operator.
-     *
-     * \par
-     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction.
-     * - Does not support \p < operators that are non-commutative.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [-]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run min-reduction
-     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // d_out <-- [0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Min(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input value type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            cub::Min(),
-            Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
-     *
-     * \par
-     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
-     *   - The minimum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
-     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
-     * - Does not support \p < operators that are non-commutative.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int                      num_items;      // e.g., 7
-     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run argmin-reduction
-     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
-     *
-     * // d_out <-- [{5, 0}]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>cub::KeyValuePair<int, T></tt>) \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ArgMin(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
-
-        // The output tuple type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
-
-        // The output value type
-        typedef typename OutputTupleT::Value OutputValueT;
-
-        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
-        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
-        ArgIndexInputIteratorT d_indexed_in(d_in);
-
-        // Initial value
-        OutputTupleT initial_value(1, Traits<InputValueT>::Max());   // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
-
-        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_indexed_in,
-            d_out,
-            num_items,
-            cub::ArgMin(),
-            initial_value,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide maximum using the greater-than ('>') operator.
-     *
-     * \par
-     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
-     * - Does not support \p > operators that are non-commutative.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [-]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run max-reduction
-     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
-     *
-     * // d_out <-- [9]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Max(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input value type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            cub::Max(),
-            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item
-     *
-     * \par
-     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
-     *   - The maximum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
-     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
-     * - Does not support \p > operators that are non-commutative.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int                      num_items;      // e.g., 7
-     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run argmax-reduction
-     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
-     *
-     * // d_out <-- [{6, 9}]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>cub::KeyValuePair<int, T></tt>) \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ArgMax(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
-
-        // The output tuple type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
-
-        // The output value type
-        typedef typename OutputTupleT::Value OutputValueT;
-
-        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
-        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
-        ArgIndexInputIteratorT d_indexed_in(d_in);
-
-        // Initial value
-        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());     // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
-
-        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_indexed_in,
-            d_out,
-            num_items,
-            cub::ArgMax(),
-            initial_value,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
-     *
-     * \par
-     * This operation computes segmented reductions within \p d_values_in using
-     * the specified binary \p reduction_op functor.  The segments are identified by
-     * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of
-     * consecutive, identical keys.  For the <em>i</em><sup>th</sup> run encountered,
-     * the first key of the run and the corresponding value aggregate of that run are
-     * written to <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_aggregates_out[<em>i</em>]</tt>,
-     * respectively. The total number of runs encountered is written to \p d_num_runs_out.
-     *
-     * \par
-     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following chart illustrates reduction-by-key (sum) performance across
-     * different CUDA architectures for \p fp32 and \p fp64 values, respectively.  Segments
-     * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
-     *
-     * \image html reduce_by_key_fp32_len_500.png
-     * \image html reduce_by_key_fp64_len_500.png
-     *
-     * \par
-     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
-     *
-     * \image html reduce_by_key_fp32_len_5.png
-     * \image html reduce_by_key_fp64_len_5.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the segmented reduction of \p int values grouped
-     * by runs of associated \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;          // e.g., 8
-     * int          *d_keys_in;         // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int          *d_values_in;       // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
-     * int          *d_unique_out;      // e.g., [-, -, -, -, -, -, -, -]
-     * int          *d_aggregates_out;  // e.g., [-, -, -, -, -, -, -, -]
-     * int          *d_num_runs_out;    // e.g., [-]
-     * CustomMin    reduction_op;
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run reduce-by-key
-     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
-     *
-     * // d_unique_out      <-- [0, 2, 9, 5, 8]
-     * // d_aggregates_out  <-- [0, 1, 6, 2, 4]
-     * // d_num_runs_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam KeysInputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input keys \iterator
-     * \tparam UniqueOutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing unique output keys \iterator
-     * \tparam ValuesInputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input values \iterator
-     * \tparam AggregatesOutputIterator <b>[inferred]</b> Random-access output iterator type for writing output value aggregates \iterator
-     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
-     * \tparam ReductionOpT              <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
-     */
-    template <
-        typename                    KeysInputIteratorT,
-        typename                    UniqueOutputIteratorT,
-        typename                    ValuesInputIteratorT,
-        typename                    AggregatesOutputIteratorT,
-        typename                    NumRunsOutputIteratorT,
-        typename                    ReductionOpT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t ReduceByKey(
-        void                        *d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
-        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
-        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-        ReductionOpT                reduction_op,                   ///< [in] Binary reduction functor
-        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // FlagT iterator type (not used)
-
-        // Selection op (not used)
-
-        // Default == operator
-        typedef Equality EqualityOp;
-
-        return DispatchReduceByKey<KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOpT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys_in,
-            d_unique_out,
-            d_values_in,
-            d_aggregates_out,
-            d_num_runs_out,
-            EqualityOp(),
-            reduction_op,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-/**
- * \example example_device_reduce.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/device_run_length_encode.cuh b/external/cub/cub/device/device_run_length_encode.cuh
deleted file mode 100644
index ed0bf9c7d67..00000000000
--- a/external/cub/cub/device/device_run_length_encode.cuh
+++ /dev/null
@@ -1,278 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_rle.cuh"
-#include "dispatch/dispatch_reduce_by_key.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. ![](run_length_encode_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Run-length_encoding"><em>run-length encoding</em></a>
- * computes a simple compressed representation of a sequence of input elements such that each
- * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a
- * count of the elements in that run.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceRunLengthEncode}
- *
- * \par Performance
- * \linear_performance{run-length encode}
- *
- * \par
- * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across
- * different CUDA architectures for \p int32 items.
- * Segments have lengths uniformly sampled from [1,1000].
- *
- * \image html rle_int32_len_500.png
- *
- * \par
- * \plots_below
- *
- */
-struct DeviceRunLengthEncode
-{
-
-    /**
-     * \brief Computes a run-length encoding of the sequence \p d_in.
-     *
-     * \par
-     * - For the <em>i</em><sup>th</sup> run encountered, the first key of the run and its length are written to
-     *   <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_counts_out[<em>i</em>]</tt>,
-     *   respectively.
-     * - The total number of runs encountered is written to \p d_num_runs_out.
-     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated encode performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
-     * lengths uniformly sampled from [1,1000].
-     *
-     * \image html rle_int32_len_500.png
-     * \image html rle_int64_len_500.png
-     *
-     * \par
-     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
-     *
-     * \image html rle_int32_len_5.png
-     * \image html rle_int64_len_5.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the run-length encoding of a sequence of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;          // e.g., 8
-     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int          *d_unique_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_counts_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_num_runs_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run encoding
-     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
-     *
-     * // d_unique_out      <-- [0, 2, 9, 5, 8]
-     * // d_counts_out      <-- [1, 2, 1, 3, 1]
-     * // d_num_runs_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam UniqueOutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing unique output items \iterator
-     * \tparam LengthsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing output counts \iterator
-     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    UniqueOutputIteratorT,
-        typename                    LengthsOutputIteratorT,
-        typename                    NumRunsOutputIteratorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Encode(
-        void*                       d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        LengthsOutputIteratorT      d_counts_out,                   ///< [out] Pointer to the output sequence of run-lengths (one count per run)
-        NumRunsOutputIteratorT      d_num_runs_out,                     ///< [out] Pointer to total number of runs
-        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int         OffsetT;                    // Signed integer type for global offsets
-        typedef NullType*   FlagIterator;               // FlagT iterator type (not used)
-        typedef NullType    SelectOp;                   // Selection op (not used)
-        typedef Equality    EqualityOp;                 // Default == operator
-        typedef cub::Sum    ReductionOp;                // Value reduction operator
-
-        // The lengths output value type
-        typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
-            OffsetT,                                                                                                    // ... then the OffsetT type,
-            typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
-
-        // Generator type for providing 1s values for run-length reduction
-        typedef ConstantInputIterator<LengthT, OffsetT> LengthsInputIteratorT;
-
-        return DispatchReduceByKey<InputIteratorT, UniqueOutputIteratorT, LengthsInputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOp, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_unique_out,
-            LengthsInputIteratorT((LengthT) 1),
-            d_counts_out,
-            d_num_runs_out,
-            EqualityOp(),
-            ReductionOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in.
-     *
-     * \par
-     * - For the <em>i</em><sup>th</sup> non-trivial run, the run's starting offset
-     *   and its length are written to <tt>d_offsets_out[<em>i</em>]</tt> and
-     *   <tt>d_lengths_out[<em>i</em>]</tt>, respectively.
-     * - The total number of runs encountered is written to \p d_num_runs_out.
-     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
-     * - \devicestorage
-     *
-     * \par Performance
-     *
-     * \par Snippet
-     * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;          // e.g., 8
-     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int          *d_offsets_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_lengths_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_num_runs_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run encoding
-     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
-     *
-     * // d_offsets_out         <-- [1, 4]
-     * // d_lengths_out         <-- [2, 3]
-     * // d_num_runs_out        <-- [2]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OffsetsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing run-offset values \iterator
-     * \tparam LengthsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing run-length values \iterator
-     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
-     */
-    template <
-        typename                InputIteratorT,
-        typename                OffsetsOutputIteratorT,
-        typename                LengthsOutputIteratorT,
-        typename                NumRunsOutputIteratorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t NonTrivialRuns(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT          d_in,                           ///< [in] Pointer to input sequence of data items
-        OffsetsOutputIteratorT  d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run)
-        LengthsOutputIteratorT  d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run)
-        NumRunsOutputIteratorT  d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
-        int                     num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
-        cudaStream_t            stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int         OffsetT;                    // Signed integer type for global offsets
-        typedef Equality    EqualityOp;                 // Default == operator
-
-        return DeviceRleDispatch<InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_offsets_out,
-            d_lengths_out,
-            d_num_runs_out,
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/device_scan.cuh b/external/cub/cub/device/device_scan.cuh
deleted file mode 100644
index 4589279eeb6..00000000000
--- a/external/cub/cub/device/device_scan.cuh
+++ /dev/null
@@ -1,443 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_scan.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * Given a sequence of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
- * produces an output sequence where each element is computed to be the reduction
- * of the elements occurring earlier in the input sequence.  <em>Prefix sum</em>
- * connotes a prefix scan with the addition operator. The term \em inclusive indicates
- * that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
- * The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
- * the <em>i</em><sup>th</sup> output reduction.
- *
- * \par
- * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our <em>"decoupled look-back"</em> algorithm
- * for performing global prefix scan with only a single pass through the
- * input data, as described in our 2016 technical report [1].  The central
- * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies
- * of global prefix propagation with local computation.  As such, our algorithm requires only
- * ~2<em>n</em> data movement (<em>n</em> inputs are read, <em>n</em> outputs are written), and typically
- * proceeds at "memcpy" speeds.
- *
- * \par
- * [1] [Duane Merrill and Michael Garland.  "Single-pass Parallel Prefix Scan with Decoupled Look-back", <em>NVIDIA Technical Report NVR-2016-002</em>, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back)
- *
- * \par Usage Considerations
- * \cdp_class{DeviceScan}
- *
- * \par Performance
- * \linear_performance{prefix scan}
- *
- * \par
- * The following chart illustrates DeviceScan::ExclusiveSum
- * performance across different CUDA architectures for \p int32 keys.
- * \plots_below
- *
- * \image html scan_int32.png
- *
- */
-struct DeviceScan
-{
-    /******************************************************************//**
-     * \name Exclusive scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes a device-wide exclusive prefix sum.  The value of 0 is applied as the initial value, and is assigned to *d_out.
-     *
-     * \par
-     * - Supports non-commutative sum operators.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated exclusive sum performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.
-     *
-     * \image html scan_int32.png
-     * \image html scan_int64.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the exclusive prefix sum of an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run exclusive prefix sum
-     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // d_out s<-- [0, 8, 14, 21, 26, 29, 29]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     */
-    template <
-        typename        InputIteratorT,
-        typename        OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ExclusiveSum(
-        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
-        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The output value type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-        // Initial value
-        OutputT init_value = 0;
-
-        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            Sum(),
-            init_value,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.  The \p init_value value is applied as the initial value, and is assigned to *d_out.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;      // e.g., 7
-     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * CustomMin    min_op
-     * ...
-     *
-     * // Determine temporary device storage requirements for exclusive prefix scan
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
-     *
-     * // Allocate temporary storage for exclusive prefix scan
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run exclusive prefix min-scan
-     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
-     *
-     * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT   <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIteratorT  <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     * \tparam ScanOp           <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam Identity         <b>[inferred]</b> Type of the \p identity value used Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename        InputIteratorT,
-        typename        OutputIteratorT,
-        typename        ScanOpT,
-        typename        InitValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ExclusiveScan(
-        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
-        ScanOpT         scan_op,                            ///< [in] Binary scan functor
-        InitValueT      init_value,                         ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out)
-        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            scan_op,
-            init_value,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive scans
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a device-wide inclusive prefix sum.
-     *
-     * \par
-     * - Supports non-commutative sum operators.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the inclusive prefix sum of an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * ...
-     *
-     * // Determine temporary device storage requirements for inclusive prefix sum
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // Allocate temporary storage for inclusive prefix sum
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run inclusive prefix sum
-     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // d_out <-- [8, 14, 21, 26, 29, 29, 38]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t InclusiveSum(
-        void*               d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                          ///< [out] Pointer to the output sequence of data items
-        int                 num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t        stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            Sum(),
-            NullType(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;      // e.g., 7
-     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * CustomMin    min_op;
-     * ...
-     *
-     * // Determine temporary device storage requirements for inclusive prefix scan
-     * void *d_temp_storage = NULL;
-     * size_t temp_storage_bytes = 0;
-     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
-     *
-     * // Allocate temporary storage for inclusive prefix scan
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run inclusive prefix min-scan
-     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
-     *
-     * // d_out <-- [8, 6, 6, 5, 3, 0, 0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT   <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIteratorT  <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     * \tparam ScanOp           <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename        InputIteratorT,
-        typename        OutputIteratorT,
-        typename        ScanOpT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t InclusiveScan(
-        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
-        ScanOpT         scan_op,                            ///< [in] Binary scan functor
-        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream             = 0,             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous  = false)         ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            scan_op,
-            NullType(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-    //@}  end member group
-
-};
-
-/**
- * \example example_device_scan.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/device_segmented_radix_sort.cuh b/external/cub/cub/device/device_segmented_radix_sort.cuh
deleted file mode 100644
index 7f8bf8e7b3c..00000000000
--- a/external/cub/cub/device/device_segmented_radix_sort.cuh
+++ /dev/null
@@ -1,875 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_radix_sort.cuh"
-#include "../util_arch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png)
- * \ingroup SegmentedModule
- *
- * \par Overview
- * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
- * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
- * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
- * characters, etc.) specified from least-significant to most-significant.  For a
- * given input sequence of keys and a set of rules specifying a total ordering
- * of the symbolic alphabet, the radix sorting method produces a lexicographic
- * ordering of those keys.
- *
- * \par
- * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
- * <tt>unsigned char</tt>, \p int, \p double, etc.  Although the direct radix sorting
- * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort
- * is able to sort signed and floating-point types via simple bit-wise transformations
- * that ensure lexicographic key ordering.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceSegmentedRadixSort}
- *
- */
-struct DeviceSegmentedRadixSort
-{
-
-    /******************************************************************//**
-     * \name Key-value pairs
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Sorts segments of key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
-     * // d_values_out          <-- [1, 2, 0, 5, 4, 3, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam ValueT           <b>[inferred]</b> Value type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
-     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam ValueT           <b>[inferred]</b> Value type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename                KeyT,
-        typename                ValueT,
-        typename                OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT         d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT         d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
-     * // d_values_out          <-- [0, 2, 1, 6, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam ValueT           <b>[inferred]</b> Value type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets (plain int, matching the int num_items / num_segments parameters)
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);          // pair the caller's in/out key pointers; const is cast away only so the input fits DoubleBuffer<KeyT>
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);  // same pairing for the associated values
-
-        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(   // leading template flag is true in the *Descending entry points and false in the ascending ones
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,              // the DoubleBuffer overload passes true here; presumably the "may overwrite input buffers" switch -- confirm against Dispatch's signature
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When the input is a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
-     * // d_values.Current()    <-- [0, 2, 1, 6, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam ValueT           <b>[inferred]</b> Value type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename                KeyT,
-        typename                ValueT,
-        typename                OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT         d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT         d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets (plain int, matching the int num_items / num_segments parameters)
-        typedef int OffsetT;
-
-        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(   // leading template flag is true in the *Descending entry points and false in the ascending ones
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,             // caller-owned double-buffers are forwarded by reference, unchanged
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,               // the separate in/out pointer overload passes false here; presumably the "may overwrite input buffers" switch -- confirm against Dispatch's signature
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Keys-only
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sorts segments of keys into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - When the input is a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets (plain int, matching the int num_items / num_segments parameters)
-        typedef int OffsetT;
-
-        // Pair the caller's in/out key pointers; the values slot is a NullType placeholder because this is a keys-only sort
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);   // const is cast away only so the input fits DoubleBuffer<KeyT>
-        DoubleBuffer<NullType>  d_values;                                           // default-constructed: no value buffers are supplied
-
-        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(   // leading template flag is false in the ascending entry points and true in the *Descending ones
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,              // the DoubleBuffer overload passes true here; presumably the "may overwrite input buffers" switch -- confirm against Dispatch's signature
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of keys into ascending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When the input is a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets (plain int, matching the int num_items / num_segments parameters)
-        typedef int OffsetT;
-
-        // Keys-only sort: the values slot is a default-constructed NullType placeholder
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(   // leading template flag is false in the ascending entry points and true in the *Descending ones
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,             // caller-owned double-buffer is forwarded by reference, unchanged
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,               // the separate in/out pointer overload passes false here; presumably the "may overwrite input buffers" switch -- confirm against Dispatch's signature
-            stream,
-            debug_synchronous);
-    }
-
-    /**
-     * \brief Sorts segments of keys into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - When the input is a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // No DoubleBuffer wrapper is needed for this overload: d_keys_in and d_keys_out
-     * // are passed directly to the call below.
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets (plain int, matching the int num_items / num_segments parameters)
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);   // pair the caller's in/out key pointers; const is cast away only so the input fits DoubleBuffer<KeyT>
-        DoubleBuffer<NullType>  d_values;                                           // keys-only sort: default-constructed NullType placeholder for the values slot
-
-        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(   // leading template flag is true in the *Descending entry points and false in the ascending ones
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,              // the DoubleBuffer overload passes true here; presumably the "may overwrite input buffers" switch -- confirm against Dispatch's signature
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of keys into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When the input is a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/device_segmented_reduce.cuh b/external/cub/cub/device/device_segmented_reduce.cuh
deleted file mode 100644
index 1964ec1f1c4..00000000000
--- a/external/cub/cub/device/device_segmented_reduce.cuh
+++ /dev/null
@@ -1,619 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../iterator/arg_index_input_iterator.cuh"
-#include "dispatch/dispatch_reduce.cuh"
-#include "dispatch/dispatch_reduce_by_key.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png)
- * \ingroup SegmentedModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceSegmentedReduce}
- *
- */
-struct DeviceSegmentedReduce
-{
-    /**
-     * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor.
-     *
-     * \par
-     * - Does not support binary reduction operators that are non-commutative.
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_segments;   // e.g., 3
-     * int          *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;         // e.g., [-, -, -]
-     * CustomMin    min_op;
-     * int          initial_value;           // e.g., INT_MAX
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run reduction
-     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
-     *
-     * // d_out <-- [6, INT_MAX, 0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam T                    <b>[inferred]</b> Data element type that is convertible to the \p value type of \p InputIteratorT
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT,
-        typename            ReductionOp,
-        typename            T>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Reduce(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        ReductionOp         reduction_op,                       ///< [in] Binary reduction functor 
-        T                   initial_value,                      ///< [in] Initial value of the reduction for each segment
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOp>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            reduction_op,
-            initial_value,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
-     *
-     * \par
-     * - Uses \p 0 as the initial value of the reduction for each segment.
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - Does not support \p + operators that are non-commutative..
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sum reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int num_segments;   // e.g., 3
-     * int *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int *d_out;         // e.g., [-, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sum-reduction
-     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_out <-- [21, 0, 17]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Sum(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The output value type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Sum>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            cub::Sum(),
-            OutputT(),            // zero-initialize
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide segmented minimum using the less-than ('<') operator.
-     *
-     * \par
-     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction for each segment.
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - Does not support \p < operators that are non-commutative.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int num_segments;   // e.g., 3
-     * int *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int *d_out;         // e.g., [-, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run min-reduction
-     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_out <-- [6, INT_MAX, 0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Min(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input value type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Min>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            cub::Min(),
-            Traits<InputT>::Max(),    // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.
-     *
-     * \par
-     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
-     *   - The minimum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
-     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - Does not support \p < operators that are non-commutative.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int                      num_segments;   // e.g., 3
-     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run argmin-reduction
-     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>KeyValuePair<int, T></tt>) \iterator
-     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ArgMin(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
-
-        // The output tuple type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
-
-        // The output value type
-        typedef typename OutputTupleT::Value OutputValueT;
-
-        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
-        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
-        ArgIndexInputIteratorT d_indexed_in(d_in);
-
-        // Initial value
-        OutputTupleT initial_value(1, Traits<InputValueT>::Max());   // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
-
-        return DispatchSegmentedReduce<ArgIndexInputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMin>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_indexed_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            cub::ArgMin(),
-            initial_value,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator.
-     *
-     * \par
-     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - Does not support \p > operators that are non-commutative.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int num_segments;   // e.g., 3
-     * int *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int *d_out;         // e.g., [-, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run max-reduction
-     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_out <-- [8, INT_MIN, 9]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Max(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input value type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Max>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            cub::Max(),
-            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item
-     *
-     * \par
-     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
-     *   - The maximum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
-     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - Does not support \p > operators that are non-commutative.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int                      num_segments;   // e.g., 3
-     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run argmax-reduction
-     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>KeyValuePair<int, T></tt>) \iterator
-     * \tparam OffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ArgMax(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
-
-        // The output tuple type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
-
-        // The output value type
-        typedef typename OutputTupleT::Value OutputValueT;
-
-        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
-        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
-        ArgIndexInputIteratorT d_indexed_in(d_in);
-
-        // Initial value
-        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());     // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
-
-        return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMax>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_indexed_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            cub::ArgMax(),
-            initial_value,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/device_select.cuh b/external/cub/cub/device/device_select.cuh
deleted file mode 100644
index 58bfe82ba30..00000000000
--- a/external/cub/cub/device/device_select.cuh
+++ /dev/null
@@ -1,369 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_select_if.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * These operations apply a selection criterion to selectively copy
- * items from a specified input sequence to a compact output sequence.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceSelect}
- *
- * \par Performance
- * \linear_performance{select-flagged, select-if, and select-unique}
- *
- * \par
- * The following chart illustrates DeviceSelect::If
- * performance across different CUDA architectures for \p int32 items,
- * where 50% of the items are randomly selected.
- *
- * \image html select_if_int32_50_percent.png
- *
- * \par
- * The following chart illustrates DeviceSelect::Unique
- * performance across different CUDA architectures for \p int32 items
- * where segments have lengths uniformly sampled from [1,1000].
- *
- * \image html select_unique_int32_len_500.png
- *
- * \par
- * \plots_below
- *
- */
-struct DeviceSelect
-{
-    /**
-     * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png)
-     *
-     * \par
-     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
-     * int  num_items;              // e.g., 8
-     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
-     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
-     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int  *d_num_selected_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // d_out                 <-- [1, 4, 6, 7]
-     * // d_num_selected_out    <-- [4]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
-     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    FlagIterator,
-        typename                    OutputIteratorT,
-        typename                    NumSelectedIteratorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Flagged(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
-        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
-        NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
-        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     OffsetT;         // Signed integer type for global offsets
-        typedef NullType                SelectOp;       // Selection op (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_flags,
-            d_out,
-            d_num_selected_out,
-            SelectOp(),
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png)
-     *
-     * \par
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated select-if performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
-     * selected with 50% probability.
-     *
-     * \image html select_if_int32_50_percent.png
-     * \image html select_if_int64_50_percent.png
-     *
-     * \par
-     * The following charts are similar, but 5% selection probability:
-     *
-     * \image html select_if_int32_5_percent.png
-     * \image html select_if_int64_5_percent.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
-     *
-     * // Functor type for selecting values less than some criteria
-     * struct LessThan
-     * {
-     *     int compare;
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     LessThan(int compare) : compare(compare) {}
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     bool operator()(const int &a) const {
-     *         return (a < compare);
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int      num_items;              // e.g., 8
-     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
-     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int      *d_num_selected_out;    // e.g., [ ]
-     * LessThan select_op(7);
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // d_out                 <-- [0, 2, 3, 5, 2]
-     * // d_num_selected_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
-     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     * \tparam SelectOp             <b>[inferred]</b> Selection operator type having member <tt>bool operator()(const T &a)</tt>
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT,
-        typename                    NumSelectedIteratorT,
-        typename                    SelectOp>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t If(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
-        NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
-        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        SelectOp                    select_op,                      ///< [in] Unary selection operator
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     OffsetT;         // Signed integer type for global offsets
-        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            NULL,
-            d_out,
-            d_num_selected_out,
-            select_op,
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png)
-     *
-     * \par
-     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated select-unique performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
-     * lengths uniformly sampled from [1,1000].
-     *
-     * \image html select_unique_int32_len_500.png
-     * \image html select_unique_int64_len_500.png
-     *
-     * \par
-     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
-     *
-     * \image html select_unique_int32_len_5.png
-     * \image html select_unique_int64_len_5.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;              // e.g., 8
-     * int  *d_in;                  // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int  *d_num_selected_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
-     *
-     * // d_out                 <-- [0, 2, 9, 5, 8]
-     * // d_num_selected_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
-     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT,
-        typename                    NumSelectedIteratorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Unique(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
-        NumSelectedIteratorT         d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
-        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     OffsetT;         // Signed integer type for global offsets
-        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
-        typedef NullType                SelectOp;       // Selection op (not used)
-        typedef Equality                EqualityOp;     // Default == operator
-
-        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            NULL,
-            d_out,
-            d_num_selected_out,
-            SelectOp(),
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-/**
- * \example example_device_select_flagged.cu
- * \example example_device_select_if.cu
- * \example example_device_select_unique.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/device_spmv.cuh b/external/cub/cub/device/device_spmv.cuh
deleted file mode 100644
index 8f3a4c5cc05..00000000000
--- a/external/cub/cub/device/device_spmv.cuh
+++ /dev/null
@@ -1,174 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-#include <limits>
-
-#include "dispatch/dispatch_spmv_orig.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV).
- * \ingroup SingleModule
- *
- * \par Overview
- * The [<em>SpMV computation</em>](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication)
- * performs the matrix-vector operation
- * <em>y</em> = <em>alpha</em>*<b>A</b>*<em>x</em> + <em>beta</em>*<em>y</em>,
- * where:
- *  - <b>A</b> is an <em>m</em>x<em>n</em> sparse matrix whose non-zero structure is specified in
- *    [<em>compressed-storage-row (CSR) format</em>](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29)
- *    (i.e., three arrays: <em>values</em>, <em>row_offsets</em>, and <em>column_indices</em>)
- *  - <em>x</em> and <em>y</em> are dense vectors
- *  - <em>alpha</em> and <em>beta</em> are scalar multiplicands
- *
- * \par Usage Considerations
- * \cdp_class{DeviceSpmv}
- *
- */
-struct DeviceSpmv
-{
-    /******************************************************************//**
-     * \name CSR matrix operations
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief This function performs the matrix-vector operation <em>y</em> = <b>A</b>*<em>x</em>.
-     *
-     * \par Snippet
-     * The code snippet below illustrates SpMV upon a 9x9 CSR matrix <b>A</b>
-     * representing a 3x3 lattice (24 non-zeros).
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_spmv.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x,
-     * // and output vector y
-     * int    num_rows = 9;
-     * int    num_cols = 9;
-     * int    num_nonzeros = 24;
-     *
-     * float* d_values;  // e.g., [1, 1, 1, 1, 1, 1, 1, 1,
-     *                   //        1, 1, 1, 1, 1, 1, 1, 1,
-     *                   //        1, 1, 1, 1, 1, 1, 1, 1]
-     *
-     * int*   d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0,
-     *                          //        4, 6, 1, 3, 5, 7, 2, 4,
-     *                          //        8, 3, 7, 4, 6, 8, 5, 7]
-     *
-     * int*   d_row_offsets;    // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24]
-     *
-     * float* d_vector_x;       // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1]
-     * float* d_vector_y;       // e.g., [ ,  ,  ,  ,  ,  ,  ,  ,  ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
-     *     d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
-     *     num_rows, num_cols, num_nonzeros, alpha, beta);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run SpMV
-     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
-     *     d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
-     *     num_rows, num_cols, num_nonzeros, alpha, beta);
-     *
-     * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2]
-     *
-     * \endcode
-     *
-     * \tparam ValueT       <b>[inferred]</b> Matrix and vector value type (e.g., /p float, /p double, etc.)
-     */
-    template <
-        typename            ValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t CsrMV(
-        void*               d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        ValueT*             d_values,                           ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
-        int*                d_row_offsets,                      ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
-        int*                d_column_indices,                   ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
-        ValueT*             d_vector_x,                         ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
-        ValueT*             d_vector_y,                         ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
-        int                 num_rows,                           ///< [in] number of rows of matrix <b>A</b>.
-        int                 num_cols,                           ///< [in] number of columns of matrix <b>A</b>.
-        int                 num_nonzeros,                       ///< [in] number of nonzero elements of matrix <b>A</b>.
-        cudaStream_t        stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        SpmvParams<ValueT, int> spmv_params;
-        spmv_params.d_values             = d_values;
-        spmv_params.d_row_end_offsets    = d_row_offsets + 1;
-        spmv_params.d_column_indices     = d_column_indices;
-        spmv_params.d_vector_x           = d_vector_x;
-        spmv_params.d_vector_y           = d_vector_y;
-        spmv_params.num_rows             = num_rows;
-        spmv_params.num_cols             = num_cols;
-        spmv_params.num_nonzeros         = num_nonzeros;
-        spmv_params.alpha                = 1.0;
-        spmv_params.beta                 = 0.0;
-
-        return DispatchSpmv<ValueT, int>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            spmv_params,
-            stream,
-            debug_synchronous);
-    }
-
-    //@}  end member group
-};
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/dispatch/dispatch_histogram.cuh b/external/cub/cub/device/dispatch/dispatch_histogram.cuh
deleted file mode 100644
index cdebd8b8555..00000000000
--- a/external/cub/cub/device/dispatch/dispatch_histogram.cuh
+++ /dev/null
@@ -1,1096 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-#include <limits>
-
-#include "../../agent/agent_histogram.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../thread/thread_search.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/******************************************************************************
- * Histogram kernel entry points
- *****************************************************************************/
-
-/**
- * Histogram initialization kernel entry point
- */
-template <
-    int                                             NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
-    typename                                        CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
-    typename                                        OffsetT>                        ///< Signed integer type for global offsets
-__global__ void DeviceHistogramInitKernel(
-    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>          num_output_bins_wrapper,        ///< Number of output histogram bins per channel
-    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>    d_output_histograms_wrapper,    ///< Histogram counter data having logical dimensions <tt>CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]]</tt>
-    GridQueue<int>                                  tile_queue)                     ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
-{
-    if ((threadIdx.x == 0) && (blockIdx.x == 0))
-        tile_queue.ResetDrain();
-
-    int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-    #pragma unroll
-    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-    {
-        if (output_bin < num_output_bins_wrapper.array[CHANNEL])
-            d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
-    }
-}
-
-
-/**
- * Histogram privatized sweep kernel entry point (multi-block).  Computes privatized histograms, one per thread block.
- */
-template <
-    typename                                            AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
-    int                                                 PRIVATIZED_SMEM_BINS,           ///< Maximum number of histogram bins per channel (e.g., up to 256)
-    int                                                 NUM_CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-    int                                                 NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
-    typename                                            SampleIteratorT,                ///< The input iterator type. \iterator.
-    typename                                            CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
-    typename                                            PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
-    typename                                            OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
-    typename                                            OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
-__global__ void DeviceHistogramSweepKernel(
-    SampleIteratorT                                         d_samples,                          ///< Input data to reduce
-    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_output_bins_wrapper,            ///< The number bins per final output histogram
-    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_privatized_bins_wrapper,        ///< The number bins per privatized histogram
-    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_output_histograms_wrapper,        ///< Reference to final output histograms
-    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_privatized_histograms_wrapper,    ///< Reference to privatized histograms
-    ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>      output_decode_op_wrapper,           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
-    ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>  privatized_decode_op_wrapper,       ///< The transform operator for determining privatized counter indices from samples, one for each channel
-    OffsetT                                                 num_row_pixels,                     ///< The number of multi-channel pixels per row in the region of interest
-    OffsetT                                                 num_rows,                           ///< The number of rows in the region of interest
-    OffsetT                                                 row_stride_samples,                 ///< The number of samples between starts of consecutive rows in the region of interest
-    int                                                     tiles_per_row,                      ///< Number of image tiles per row
-    GridQueue<int>                                          tile_queue)                         ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
-{
-    // Thread block type for compositing input tiles
-    typedef AgentHistogram<
-            AgentHistogramPolicyT,
-            PRIVATIZED_SMEM_BINS,
-            NUM_CHANNELS,
-            NUM_ACTIVE_CHANNELS,
-            SampleIteratorT,
-            CounterT,
-            PrivatizedDecodeOpT,
-            OutputDecodeOpT,
-            OffsetT>
-        AgentHistogramT;
-
-    // Shared memory for AgentHistogram
-    __shared__ typename AgentHistogramT::TempStorage temp_storage;
-
-    AgentHistogramT agent(
-        temp_storage,
-        d_samples,
-        num_output_bins_wrapper.array,
-        num_privatized_bins_wrapper.array,
-        d_output_histograms_wrapper.array,
-        d_privatized_histograms_wrapper.array,
-        output_decode_op_wrapper.array,
-        privatized_decode_op_wrapper.array);
-
-    // Initialize counters
-    agent.InitBinCounters();
-
-    // Consume input tiles
-    agent.ConsumeTiles(
-        num_row_pixels,
-        num_rows,
-        row_stride_samples,
-        tiles_per_row,
-        tile_queue);
-
-    // Store output to global (if necessary)
-    agent.StoreOutput();
-
-}
-
-
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
- */
-template <
-    int         NUM_CHANNELS,               ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-    int         NUM_ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
-    typename    SampleIteratorT,            ///< Random-access input iterator type for reading input items \iterator
-    typename    CounterT,                   ///< Integer type for counting sample occurrences per histogram bin
-    typename    LevelT,                     ///< Type for specifying bin level boundaries
-    typename    OffsetT>                    ///< Signed integer type for global offsets
-struct DipatchHistogram
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// The sample value type of the input iterator
-    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-    enum
-    {
-        // Maximum number of bins per channel for which we will use a privatized smem strategy
-        MAX_PRIVATIZED_SMEM_BINS = 256
-    };
-
-
-    //---------------------------------------------------------------------
-    // Transform functors for converting samples to bin-ids
-    //---------------------------------------------------------------------
-
-    // Searches for bin given a list of bin-boundary levels
-    template <typename LevelIteratorT>
-    struct SearchTransform
-    {
-        LevelIteratorT  d_levels;                   // Pointer to levels array
-        int             num_output_levels;          // Number of levels in array
-
-        // Initializer
-        __host__ __device__ __forceinline__ void Init(
-            LevelIteratorT  d_levels,               // Pointer to levels array
-            int             num_output_levels)      // Number of levels in array
-        {
-            this->d_levels          = d_levels;
-            this->num_output_levels = num_output_levels;
-        }
-
-        // Method for converting samples to bin-ids
-        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
-        {
-            /// Level iterator wrapper type
-            typedef typename If<IsPointer<LevelIteratorT>::VALUE,
-                    CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
-                    LevelIteratorT>::Type                                           // Directly use the supplied input iterator type
-                WrappedLevelIteratorT;
-
-            WrappedLevelIteratorT wrapped_levels(d_levels);
-
-            int num_bins = num_output_levels - 1;
-            if (valid)
-            {
-                bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
-                if (bin >= num_bins)
-                    bin = -1;
-            }
-        }
-    };
-
-
-    // Scales samples to evenly-spaced bins
-    struct ScaleTransform
-    {
-        int    num_bins;    // Number of levels in array
-        LevelT max;         // Max sample level (exclusive)
-        LevelT min;         // Min sample level (inclusive)
-        LevelT scale;       // Bin scaling factor
-
-        // Initializer
-        template <typename _LevelT>
-        __host__ __device__ __forceinline__ void Init(
-            int     num_output_levels,  // Number of levels in array
-            _LevelT max,                // Max sample level (exclusive)
-            _LevelT min,                // Min sample level (inclusive)
-            _LevelT scale)              // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max;
-            this->min = min;
-            this->scale = scale;
-        }
-
-        // Initializer (float specialization)
-        __host__ __device__ __forceinline__ void Init(
-            int    num_output_levels,   // Number of levels in array
-            float   max,                // Max sample level (exclusive)
-            float   min,                // Min sample level (inclusive)
-            float   scale)              // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max;
-            this->min = min;
-            this->scale = float(1.0) / scale;
-        }
-
-        // Initializer (double specialization)
-        __host__ __device__ __forceinline__ void Init(
-            int    num_output_levels,   // Number of levels in array
-            double max,                 // Max sample level (exclusive)
-            double min,                 // Min sample level (inclusive)
-            double scale)               // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max;
-            this->min = min;
-            this->scale = double(1.0) / scale;
-        }
-
-        // Method for converting samples to bin-ids
-        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
-        {
-            LevelT level_sample = (LevelT) sample;
-
-            if (valid && (level_sample >= min) && (level_sample < max))
-                bin = (int) ((level_sample - min) / scale);
-        }
-
-        // Method for converting samples to bin-ids (float specialization)
-        template <CacheLoadModifier LOAD_MODIFIER>
-        __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
-        {
-            LevelT level_sample = (LevelT) sample;
-
-            if (valid && (level_sample >= min) && (level_sample < max))
-                bin = (int) ((level_sample - min) * scale);
-        }
-
-        // Method for converting samples to bin-ids (double specialization)
-        template <CacheLoadModifier LOAD_MODIFIER>
-        __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
-        {
-            LevelT level_sample = (LevelT) sample;
-
-            if (valid && (level_sample >= min) && (level_sample < max))
-                bin = (int) ((level_sample - min) * scale);
-        }
-    };
-
-
-    // Pass-through bin transform operator
-    struct PassThruTransform
-    {
-        // Method for converting samples to bin-ids
-        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
-        {
-            if (valid)
-                bin = (int) sample;
-        }
-    };
-
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies
-    //---------------------------------------------------------------------
-
-    template <int NOMINAL_ITEMS_PER_THREAD>
-    struct TScale
-    {
-        enum
-        {
-            V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
-            VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
-        };
-    };
-
-
-    /// SM11
-    struct Policy110
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                512,
-                (NUM_CHANNELS == 1) ? 8 : 2,
-                BLOCK_LOAD_DIRECT,
-                LOAD_DEFAULT,
-                true,
-                GMEM,
-                false>
-            HistogramSweepPolicy;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                (NUM_CHANNELS == 1) ? 256 : 128,
-                (NUM_CHANNELS == 1) ? 8 : 3,
-                (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                SMEM,
-                false>
-            HistogramSweepPolicy;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                512,
-                (NUM_CHANNELS == 1) ? 8 : 2,
-                BLOCK_LOAD_DIRECT,
-                LOAD_DEFAULT,
-                true,
-                GMEM,
-                false>
-            HistogramSweepPolicy;
-    };
-
-    /// SM35
-    struct Policy350
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                128,
-                TScale<8>::VALUE,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                true,
-                BLEND,
-                true>
-            HistogramSweepPolicy;
-    };
-
-    /// SM50
-    struct Policy500
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                384,
-                TScale<16>::VALUE,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                true,
-                SMEM,
-                false>
-            HistogramSweepPolicy;
-    };
-
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies of current PTX compiler pass
-    //---------------------------------------------------------------------
-
-#if (CUB_PTX_ARCH >= 500)
-    typedef Policy500 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#else
-    typedef Policy110 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
-
-
-    //---------------------------------------------------------------------
-    // Utilities
-    //---------------------------------------------------------------------
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t InitConfigs(
-        int             ptx_version,
-        KernelConfig    &histogram_sweep_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        return histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 500)
-        {
-            return histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 350)
-        {
-            return histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 300)
-        {
-            return histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            return histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 110)
-        {
-            return histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
-        }
-        else
-        {
-            // No global atomic support
-            return cudaErrorNotSupported;
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration
-     */
-    struct KernelConfig
-    {
-        int                             block_threads;
-        int                             pixels_per_thread;
-
-        template <typename BlockPolicy>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        cudaError_t Init()
-        {
-            block_threads               = BlockPolicy::BLOCK_THREADS;
-            pixels_per_thread           = BlockPolicy::PIXELS_PER_THREAD;
-
-            return cudaSuccess;
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Dispatch entrypoints
-    //---------------------------------------------------------------------
-
-    /**
-     * Privatization-based dispatch routine
-     */
-    template <
-        typename                            PrivatizedDecodeOpT,                            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
-        typename                            OutputDecodeOpT,                                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
-        typename                            DeviceHistogramInitKernelT,                     ///< Function type of cub::DeviceHistogramInitKernel
-        typename                            DeviceHistogramSweepKernelT>                    ///< Function type of cub::DeviceHistogramSweepKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t PrivatizedDispatch(
-        void*                               d_temp_storage,                                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                             temp_storage_bytes,                             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT                     d_samples,                                      ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*                           d_output_histograms[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
-        int                                 num_privatized_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        PrivatizedDecodeOpT                 privatized_decode_op[NUM_ACTIVE_CHANNELS],      ///< [in] Transform operators for determining bin-ids from samples, one for each channel
-        int                                 num_output_levels[NUM_ACTIVE_CHANNELS],         ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        OutputDecodeOpT                     output_decode_op[NUM_ACTIVE_CHANNELS],          ///< [in] Transform operators for determining bin-ids from samples, one for each channel
-        int                                 max_num_output_bins,                            ///< [in] Maximum number of output bins in any channel
-        OffsetT                             num_row_pixels,                                 ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT                             num_rows,                                       ///< [in] The number of rows in the region of interest
-        OffsetT                             row_stride_samples,                             ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        DeviceHistogramInitKernelT          histogram_init_kernel,                          ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
-        DeviceHistogramSweepKernelT         histogram_sweep_kernel,                         ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
-        KernelConfig                        histogram_sweep_config,                         ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
-        cudaStream_t                        stream,                                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                                debug_synchronous)                              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-    #ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-    #else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Get SM occupancy for histogram_sweep_kernel
-            int histogram_sweep_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                histogram_sweep_sm_occupancy,
-                histogram_sweep_kernel,
-                histogram_sweep_config.block_threads))) break;
-
-            // Get device occupancy for histogram_sweep_kernel
-            int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
-
-            if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
-            {
-                // Treat as a single linear array of samples
-                num_row_pixels      *= num_rows;
-                num_rows            = 1;
-                row_stride_samples  = num_row_pixels * NUM_CHANNELS;
-            }
-
-            // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
-            int pixels_per_tile     = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
-            int tiles_per_row       = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
-            int blocks_per_row      = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
-            int blocks_per_col      = (blocks_per_row > 0) ?
-                                        int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
-                                        0;
-            int num_thread_blocks   = blocks_per_row * blocks_per_col;
-
-            dim3 sweep_grid_dims;
-            sweep_grid_dims.x = (unsigned int) blocks_per_row;
-            sweep_grid_dims.y = (unsigned int) blocks_per_col;
-            sweep_grid_dims.z = 1;
-
-            // Temporary storage allocation requirements
-            const int   NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
-            void*       allocations[NUM_ALLOCATIONS];
-            size_t      allocation_sizes[NUM_ALLOCATIONS];
-
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
-
-            allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Construct the grid queue descriptor
-            GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
-
-            // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
-
-            // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
-
-            // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
-
-            // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
-
-            // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
-
-            // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
-
-            int histogram_init_block_threads    = 256;
-            int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
-
-            // Log DeviceHistogramInitKernel configuration
-            if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
-                histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
-
-            // Invoke histogram_init_kernel
-            histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
-                num_output_bins_wrapper,
-                d_output_histograms_wrapper,
-                tile_queue);
-
-            // Return if empty problem
-            if ((blocks_per_row == 0) || (blocks_per_col == 0))
-                break;
-
-            // Log histogram_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
-                sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
-                histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
-
-            // Invoke histogram_sweep_kernel
-            histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
-                d_samples,
-                num_output_bins_wrapper,
-                num_privatized_bins_wrapper,
-                d_output_histograms_wrapper,
-                d_privatized_histograms_wrapper,
-                output_decode_op_wrapper,
-                privatized_decode_op_wrapper,
-                num_row_pixels,
-                num_rows,
-                row_stride_samples,
-                tiles_per_row,
-                tile_queue);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-        }
-        while (0);
-
-        return error;
-
-    #endif // CUB_RUNTIME_ENABLED
-    }
-
-
-
-    /**
-     * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit
-     */
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t DispatchRange(
-        void*               d_temp_storage,                                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
-        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel dispatch configurations
-            KernelConfig histogram_sweep_config;
-            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
-                break;
-
-            // Use the search transform op for converting samples to privatized bins
-            typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;
-
-            // Use the pass-thru transform op for converting privatized bins to output bins
-            typedef PassThruTransform OutputDecodeOpT;
-
-            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
-            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
-            int                     max_levels = num_output_levels[0];
-
-            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-            {
-                privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
-                if (num_output_levels[channel] > max_levels)
-                    max_levels = num_output_levels[channel];
-            }
-            int max_num_output_bins = max_levels - 1;
-
-            // Dispatch
-            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
-            {
-                // Too many bins to keep in shared memory.
-                const int PRIVATIZED_SMEM_BINS = 0;
-
-                if (CubDebug(error = PrivatizedDispatch(
-                    d_temp_storage,
-                    temp_storage_bytes,
-                    d_samples,
-                    d_output_histograms,
-                    num_output_levels,
-                    privatized_decode_op,
-                    num_output_levels,
-                    output_decode_op,
-                    max_num_output_bins,
-                    num_row_pixels,
-                    num_rows,
-                    row_stride_samples,
-                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                    histogram_sweep_config,
-                    stream,
-                    debug_synchronous))) break;
-            }
-            else
-            {
-                // Dispatch shared-privatized approach
-                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
-
-                if (CubDebug(error = PrivatizedDispatch(
-                    d_temp_storage,
-                    temp_storage_bytes,
-                    d_samples,
-                    d_output_histograms,
-                    num_output_levels,
-                    privatized_decode_op,
-                    num_output_levels,
-                    output_decode_op,
-                    max_num_output_bins,
-                    num_row_pixels,
-                    num_rows,
-                    row_stride_samples,
-                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                    histogram_sweep_config,
-                    stream,
-                    debug_synchronous))) break;
-            }
-
-        } while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
-     */
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t DispatchRange(
-        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
-        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel dispatch configurations
-            KernelConfig histogram_sweep_config;
-            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
-                break;
-
-            // Use the pass-thru transform op for converting samples to privatized bins
-            typedef PassThruTransform PrivatizedDecodeOpT;
-
-            // Use the search transform op for converting privatized bins to output bins
-            typedef SearchTransform<LevelT*> OutputDecodeOpT;
-
-            int                         num_privatized_levels[NUM_ACTIVE_CHANNELS];
-            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
-            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
-            int                         max_levels = num_output_levels[0];              // Maximum number of levels in any channel
-
-            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-            {
-                num_privatized_levels[channel] = 257;
-                output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
-
-                if (num_output_levels[channel] > max_levels)
-                    max_levels = num_output_levels[channel];
-            }
-            int max_num_output_bins = max_levels - 1;
-
-            const int PRIVATIZED_SMEM_BINS = 256;
-
-            if (CubDebug(error = PrivatizedDispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_samples,
-                d_output_histograms,
-                num_privatized_levels,
-                privatized_decode_op,
-                num_output_levels,
-                output_decode_op,
-                max_num_output_bins,
-                num_row_pixels,
-                num_rows,
-                row_stride_samples,
-                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                histogram_sweep_config,
-                stream,
-                debug_synchronous))) break;
-
-        } while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t DispatchEven(
-        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
-        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel dispatch configurations
-            KernelConfig histogram_sweep_config;
-            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
-                break;
-
-            // Use the scale transform op for converting samples to privatized bins
-            typedef ScaleTransform PrivatizedDecodeOpT;
-
-            // Use the pass-thru transform op for converting privatized bins to output bins
-            typedef PassThruTransform OutputDecodeOpT;
-
-            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
-            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
-            int                         max_levels = num_output_levels[0];
-
-            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-            {
-                int     bins    = num_output_levels[channel] - 1;
-                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
-
-                privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
-
-                if (num_output_levels[channel] > max_levels)
-                    max_levels = num_output_levels[channel];
-            }
-            int max_num_output_bins = max_levels - 1;
-
-            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
-            {
-                // Dispatch shared-privatized approach
-                const int PRIVATIZED_SMEM_BINS = 0;
-
-                if (CubDebug(error = PrivatizedDispatch(
-                    d_temp_storage,
-                    temp_storage_bytes,
-                    d_samples,
-                    d_output_histograms,
-                    num_output_levels,
-                    privatized_decode_op,
-                    num_output_levels,
-                    output_decode_op,
-                    max_num_output_bins,
-                    num_row_pixels,
-                    num_rows,
-                    row_stride_samples,
-                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                    histogram_sweep_config,
-                    stream,
-                    debug_synchronous))) break;
-            }
-            else
-            {
-                // Dispatch shared-privatized approach
-                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
-
-                if (CubDebug(error = PrivatizedDispatch(
-                    d_temp_storage,
-                    temp_storage_bytes,
-                    d_samples,
-                    d_output_histograms,
-                    num_output_levels,
-                    privatized_decode_op,
-                    num_output_levels,
-                    output_decode_op,
-                    max_num_output_bins,
-                    num_row_pixels,
-                    num_rows,
-                    row_stride_samples,
-                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                    histogram_sweep_config,
-                    stream,
-                    debug_synchronous))) break;
-            }
-        }
-        while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t DispatchEven(
-        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
-        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel dispatch configurations
-            KernelConfig histogram_sweep_config;
-            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
-                break;
-
-            // Use the pass-thru transform op for converting samples to privatized bins
-            typedef PassThruTransform PrivatizedDecodeOpT;
-
-            // Use the scale transform op for converting privatized bins to output bins
-            typedef ScaleTransform OutputDecodeOpT;
-
-            int                     num_privatized_levels[NUM_ACTIVE_CHANNELS];
-            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
-            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
-            int                     max_levels = num_output_levels[0];
-
-            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-            {
-                num_privatized_levels[channel] = 257;
-
-                int     bins    = num_output_levels[channel] - 1;
-                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
-                output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
-
-                if (num_output_levels[channel] > max_levels)
-                    max_levels = num_output_levels[channel];
-            }
-            int max_num_output_bins = max_levels - 1;
-
-            const int PRIVATIZED_SMEM_BINS = 256;
-
-            if (CubDebug(error = PrivatizedDispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_samples,
-                d_output_histograms,
-                num_privatized_levels,
-                privatized_decode_op,
-                num_output_levels,
-                output_decode_op,
-                max_num_output_bins,
-                num_row_pixels,
-                num_rows,
-                row_stride_samples,
-                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                histogram_sweep_config,
-                stream,
-                debug_synchronous))) break;
-
-        }
-        while (0);
-
-        return error;
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/external/cub/cub/device/dispatch/dispatch_radix_sort.cuh
deleted file mode 100644
index f9793ebd53e..00000000000
--- a/external/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+++ /dev/null
@@ -1,1652 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../agent/agent_radix_sort_upsweep.cuh"
-#include "../../agent/agent_radix_sort_downsweep.cuh"
-#include "../../agent/agent_scan.cuh"
-#include "../../block/block_radix_sort.cuh"
-#include "../../grid/grid_even_share.cuh"
-#include "../../util_type.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Upsweep digit-counting kernel entry point (multi-block).  Computes privatized digit histograms, one per block.
- */
-template <
-    typename                ChainedPolicyT,                 ///< Chained tuning policy
-    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
-    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
-    typename                KeyT,                           ///< Key type
-    typename                OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int((ALT_DIGIT_BITS) ?
-    ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :
-    ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))
-__global__ void DeviceRadixSortUpsweepKernel(
-    const KeyT              *d_keys,                        ///< [in] Input keys buffer
-    OffsetT                 *d_spine,                       ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
-    OffsetT                 /*num_items*/,                  ///< [in] Total number of input data items
-    int                     current_bit,                    ///< [in] Bit position of current radix digit
-    int                     num_bits,                       ///< [in] Number of bits of current radix digit
-    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block
-{
-    enum {
-        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
-                        ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
-    };
-
-    // Parameterize AgentRadixSortUpsweep type for the current configuration
-    typedef AgentRadixSortUpsweep<
-            typename If<(ALT_DIGIT_BITS),
-                typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
-                typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type,
-            KeyT,
-            OffsetT>
-        AgentRadixSortUpsweepT;
-
-    // Shared memory storage
-    __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage;
-
-    // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block
-    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
-
-    AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits);
-
-    upsweep.ProcessRegion(even_share.block_offset, even_share.block_end);
-
-    CTA_SYNC();
-
-    // Write out digit counts (striped)
-    upsweep.ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
-}
-
-
-/**
- * Spine scan kernel entry point (single-block).  Computes an exclusive prefix sum over the privatized digit histograms
- */
-template <
-    typename                ChainedPolicyT,                 ///< Chained tuning policy
-    typename                OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
-__global__ void RadixSortScanBinsKernel(
-    OffsetT                 *d_spine,                       ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
-    int                     num_counts)                     ///< [in] Total number of bin-counts
-{
-    // Parameterize the AgentScan type for the current configuration
-    typedef AgentScan<
-            typename ChainedPolicyT::ActivePolicy::ScanPolicy,
-            OffsetT*,
-            OffsetT*,
-            cub::Sum,
-            OffsetT,
-            OffsetT>
-        AgentScanT;
-
-    // Shared memory storage
-    __shared__ typename AgentScanT::TempStorage temp_storage;
-
-    // Block scan instance
-    AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ;
-
-    // Process full input tiles
-    int block_offset = 0;
-    BlockScanRunningPrefixOp<OffsetT, Sum> prefix_op(0, Sum());
-    while (block_offset + AgentScanT::TILE_ITEMS <= num_counts)
-    {
-        block_scan.template ConsumeTile<false, false>(block_offset, prefix_op);
-        block_offset += AgentScanT::TILE_ITEMS;
-    }
-}
-
-
-/**
- * Downsweep pass kernel entry point (multi-block).  Scatters keys (and values) into corresponding bins for the current digit place.
- */
-template <
-    typename                ChainedPolicyT,                 ///< Chained tuning policy
-    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
-    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
-    typename                KeyT,                           ///< Key type
-    typename                ValueT,                         ///< Value type
-    typename                OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int((ALT_DIGIT_BITS) ?
-    ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS :
-    ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))
-__global__ void DeviceRadixSortDownsweepKernel(
-    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
-    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
-    const ValueT            *d_values_in,                   ///< [in] Input values buffer
-    ValueT                  *d_values_out,                  ///< [in] Output values buffer
-    OffsetT                 *d_spine,                       ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
-    OffsetT                 num_items,                      ///< [in] Total number of input data items
-    int                     current_bit,                    ///< [in] Bit position of current radix digit
-    int                     num_bits,                       ///< [in] Number of bits of current radix digit
-    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block
-{
-    enum {
-        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
-                        ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
-    };
-
-    // Parameterize AgentRadixSortDownsweep type for the current configuration
-    typedef AgentRadixSortDownsweep<
-            typename If<(ALT_DIGIT_BITS),
-                typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
-                typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type,
-            IS_DESCENDING,
-            KeyT,
-            ValueT,
-            OffsetT>
-        AgentRadixSortDownsweepT;
-
-    // Shared memory storage
-    __shared__  typename AgentRadixSortDownsweepT::TempStorage temp_storage;
-
-    // Initialize even-share descriptor for this thread block
-    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
-
-    // Process input tiles
-    AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(
-        even_share.block_offset,
-        even_share.block_end);
-}
-
-
-/**
- * Single pass kernel entry point (single-block).  Fully sorts a tile of input.
- */
-template <
-    typename                ChainedPolicyT,                 ///< Chained tuning policy
-    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
-    typename                KeyT,                           ///< Key type
-    typename                ValueT,                         ///< Value type
-    typename                OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
-__global__ void DeviceRadixSortSingleTileKernel(
-    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
-    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
-    const ValueT            *d_values_in,                   ///< [in] Input values buffer
-    ValueT                  *d_values_out,                  ///< [in] Output values buffer
-    OffsetT                 num_items,                      ///< [in] Total number of input data items
-    int                     current_bit,                    ///< [in] Bit position of current radix digit
-    int                     end_bit)                        ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-{
-    // Constants
-    enum
-    {
-        BLOCK_THREADS           = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD,
-        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,
-    };
-
-    // BlockRadixSort type
-    typedef BlockRadixSort<
-            KeyT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            ValueT,
-            ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS,
-            (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
-            ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM>
-        BlockRadixSortT;
-
-    // BlockLoad type (keys)
-    typedef BlockLoad<
-        KeyT,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys;
-
-    // BlockLoad type (values)
-    typedef BlockLoad<
-        ValueT,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues;
-
-    // Unsigned word for key bits
-    typedef typename Traits<KeyT>::UnsignedBits UnsignedBitsT;
-
-    // Shared memory storage
-    __shared__ union TempStorage
-    {
-        typename BlockRadixSortT::TempStorage       sort;
-        typename BlockLoadKeys::TempStorage         load_keys;
-        typename BlockLoadValues::TempStorage       load_values;
-
-    } temp_storage;
-
-    // Keys and values for the block
-    KeyT            keys[ITEMS_PER_THREAD];
-    ValueT          values[ITEMS_PER_THREAD];
-
-    // Get default (min/max) value for out-of-bounds keys
-    UnsignedBitsT   default_key_bits = (IS_DESCENDING) ? Traits<KeyT>::LOWEST_KEY : Traits<KeyT>::MAX_KEY;
-    KeyT            default_key = reinterpret_cast<KeyT&>(default_key_bits);
-
-    // Load keys
-    BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key);
-
-    CTA_SYNC();
-
-    // Load values
-    if (!KEYS_ONLY)
-    {
-        BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items);
-
-        CTA_SYNC();
-    }
-
-    // Sort tile
-    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(
-        keys,
-        values,
-        current_bit,
-        end_bit,
-        Int2Type<IS_DESCENDING>(),
-        Int2Type<KEYS_ONLY>());
-
-    // Store keys and values
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {
-        int item_offset = ITEM * BLOCK_THREADS + threadIdx.x;
-        if (item_offset < num_items)
-        {
-            d_keys_out[item_offset] = keys[ITEM];
-            if (!KEYS_ONLY)
-                d_values_out[item_offset] = values[ITEM];
-        }
-    }
-}
-
-
-/**
- * Segmented radix sorting pass (one block per segment)
- */
-template <
-    typename                ChainedPolicyT,                 ///< Chained tuning policy
-    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
-    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
-    typename                KeyT,                           ///< Key type
-    typename                ValueT,                         ///< Value type
-    typename                OffsetIteratorT,                ///< Random-access input iterator type for reading segment offsets \iterator
-    typename                OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int((ALT_DIGIT_BITS) ?
-    ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :
-    ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
-__global__ void DeviceSegmentedRadixSortKernel(
-    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
-    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
-    const ValueT            *d_values_in,                   ///< [in] Input values buffer
-    ValueT                  *d_values_out,                  ///< [in] Output values buffer
-    OffsetIteratorT         d_begin_offsets,                ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-    OffsetIteratorT         d_end_offsets,                  ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-    int                     /*num_segments*/,               ///< [in] The number of segments that comprise the sorting data
-    int                     current_bit,                    ///< [in] Bit position of current radix digit
-    int                     pass_bits)                      ///< [in] Number of bits of current radix digit
-{
-    //
-    // Constants
-    //
-
-    typedef typename If<(ALT_DIGIT_BITS),
-        typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy,
-        typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT;
-
-    enum
-    {
-        BLOCK_THREADS       = SegmentedPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = SegmentedPolicyT::ITEMS_PER_THREAD,
-        RADIX_BITS          = SegmentedPolicyT::RADIX_BITS,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-        RADIX_DIGITS        = 1 << RADIX_BITS,
-        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,
-    };
-
-    // Upsweep type
-    typedef AgentRadixSortUpsweep<
-            AgentRadixSortUpsweepPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, SegmentedPolicyT::LOAD_MODIFIER, RADIX_BITS>,
-            KeyT,
-            OffsetT>
-        BlockUpsweepT;
-
-    // Digit-scan type
-    typedef BlockScan<OffsetT, BLOCK_THREADS> DigitScanT;
-
-    // Downsweep type
-    typedef AgentRadixSortDownsweep<SegmentedPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT> BlockDownsweepT;
-
-    enum
-    {
-        /// Number of bin-starting offsets tracked per thread
-        BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD
-    };
-
-    //
-    // Process input tiles
-    //
-
-    // Shared memory storage
-    __shared__ union
-    {
-        typename BlockUpsweepT::TempStorage     upsweep;
-        typename BlockDownsweepT::TempStorage   downsweep;
-        struct
-        {
-            volatile OffsetT                        reverse_counts_in[RADIX_DIGITS];
-            volatile OffsetT                        reverse_counts_out[RADIX_DIGITS];
-            typename DigitScanT::TempStorage        scan;
-        };
-
-    } temp_storage;
-
-    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
-    OffsetT segment_end     = d_end_offsets[blockIdx.x];
-    OffsetT num_items       = segment_end - segment_begin;
-
-    // Check if empty segment
-    if (num_items <= 0)
-        return;
-
-    // Upsweep
-    BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits);
-    upsweep.ProcessRegion(segment_begin, segment_end);
-
-    CTA_SYNC();
-
-    // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads)
-    OffsetT bin_count[BINS_TRACKED_PER_THREAD];
-    upsweep.ExtractCounts(bin_count);
-
-    CTA_SYNC();
-
-    if (IS_DESCENDING)
-    {
-        // Reverse bin counts
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-                temp_storage.reverse_counts_in[bin_idx] = bin_count[track];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-                bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1];
-        }
-    }
-
-    // Scan
-    OffsetT bin_offset[BINS_TRACKED_PER_THREAD];     // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads)
-    DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset);
-
-    #pragma unroll
-    for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-    {
-        bin_offset[track] += segment_begin;
-    }
-
-    if (IS_DESCENDING)
-    {
-        // Reverse bin offsets
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-                temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-                bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1];
-        }
-    }
-
-    CTA_SYNC();
-
-    // Downsweep
-    BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits);
-    downsweep.ProcessRegion(segment_begin, segment_end);
-}
-
-
-
-/******************************************************************************
- * Policy
- ******************************************************************************/
-
-/**
- * Tuning policy for kernel specialization
- */
-template <
-    typename KeyT,          ///< Key type
-    typename ValueT,        ///< Value type
-    typename OffsetT>       ///< Signed integer type for global offsets
-struct DeviceRadixSortPolicy
-{
-    //------------------------------------------------------------------------------
-    // Constants
-    //------------------------------------------------------------------------------
-
-    enum
-    {
-        // Whether this is a keys-only (or key-value) sort
-        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
-
-        // Relative size of KeyT type to a 4-byte word
-        SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
-    };
-
-    //------------------------------------------------------------------------------
-    // Architecture-specific tuning policies
-    //------------------------------------------------------------------------------
-
-    /// SM13
-    struct Policy130 : ChainedPolicy<130, Policy130, Policy130>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 5,
-            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
-        };
-
-        // Keys-only upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyKeys;
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyKeys;
-
-        // Key-value pairs upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyPairs;
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyPairs;
-
-        // Upsweep policies
-        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
-
-        // Scan policy
-        typedef AgentScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
-
-        // Keys-only downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyKeys;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyKeys;
-
-        // Key-value pairs downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyPairs;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyPairs;
-
-        // Downsweep policies
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
-
-        // Single-tile policy
-        typedef DownsweepPolicy SingleTilePolicy;
-
-        // Segmented policies
-        typedef DownsweepPolicy     SegmentedPolicy;
-        typedef AltDownsweepPolicy  AltSegmentedPolicy;
-    };
-
-    /// SM20
-    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 5,
-            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
-        };
-
-        // Keys-only upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;
-        typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;
-
-        // Key-value pairs upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyPairs;
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyPairs;
-
-        // Upsweep policies
-        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
-
-        // Scan policy
-        typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Keys-only downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyKeys;
-        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyKeys;
-
-        // Key-value pairs downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyPairs;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyPairs;
-
-        // Downsweep policies
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
-
-        // Single-tile policy
-        typedef DownsweepPolicy SingleTilePolicy;
-
-        // Segmented policies
-        typedef DownsweepPolicy     SegmentedPolicy;
-        typedef AltDownsweepPolicy  AltSegmentedPolicy;
-    };
-
-    /// SM30
-    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 5,
-            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
-        };
-
-        // Keys-only upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;
-        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;
-
-        // Key-value pairs upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyPairs;
-        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyPairs;
-
-        // Upsweep policies
-        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
-
-        // Scan policy
-        typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
-
-        // Keys-only downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyKeys;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyKeys;
-
-        // Key-value pairs downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyPairs;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyPairs;
-
-        // Downsweep policies
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
-
-        // Single-tile policy
-        typedef DownsweepPolicy SingleTilePolicy;
-
-        // Segmented policies
-        typedef DownsweepPolicy     SegmentedPolicy;
-        typedef AltDownsweepPolicy  AltSegmentedPolicy;
-    };
-
-
-    /// SM35
-    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 6,    // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m)
-        };
-
-        // Scan policy
-        typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
-
-        // Keys-only downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <128,   CUB_MAX(1, 9 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;
-        typedef AgentRadixSortDownsweepPolicy <64,   CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys;
-
-        // Key-value pairs downsweep policies
-        typedef DownsweepPolicyKeys DownsweepPolicyPairs;
-        typedef AgentRadixSortDownsweepPolicy <128,  CUB_MAX(1, 15 / SCALE_FACTOR_4B), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs;
-
-        // Downsweep policies
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef DownsweepPolicy UpsweepPolicy;
-        typedef AltDownsweepPolicy AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef DownsweepPolicy SingleTilePolicy;
-
-        // Segmented policies
-        typedef DownsweepPolicy     SegmentedPolicy;
-        typedef AltDownsweepPolicy  AltSegmentedPolicy;
-
-
-    };
-
-
-    /// SM50
-    struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 7,    // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX)
-            SINGLE_TILE_RADIX_BITS  = 6,
-            SEGMENTED_RADIX_BITS    = 6,    // 3.1B 32b segmented keys/s (TitanX)
-        };
-
-        // ScanPolicy
-        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <160, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>  DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef DownsweepPolicy UpsweepPolicy;
-        typedef AltDownsweepPolicy AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;
-
-        // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 31 / SCALE_FACTOR_4B),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>   SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1>       AltSegmentedPolicy;
-    };
-
-
-    /// SM60 (GP100)
-    struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 7,    // 6.9B 32b keys/s (Quadro P100)
-            SINGLE_TILE_RADIX_BITS  = 6,
-            SEGMENTED_RADIX_BITS    = 6,    // 5.9B 32b segmented keys/s (Quadro P100)
-        };
-
-        // ScanPolicy
-        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 25 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef DownsweepPolicy UpsweepPolicy;
-        typedef AltDownsweepPolicy AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
-
-        // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
-
-    };
-
-
-    /// SM61 (GP104)
-    struct Policy610 : ChainedPolicy<610, Policy610, Policy600>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 7,    // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080)
-            SINGLE_TILE_RADIX_BITS  = 6,
-            SEGMENTED_RADIX_BITS    = 6,    // 3.3B 32b segmented keys/s (1080)
-        };
-
-        // ScanPolicy
-        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 31 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT,       LOAD_DEFAULT,       RADIX_RANK_MATCH,   BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 35 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE,    LOAD_DEFAULT,   RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS>        UpsweepPolicy;
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 16 / SCALE_FACTOR_4B), LOAD_LDG, PRIMARY_RADIX_BITS - 1>    AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
-
-        // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
-    };
-
-
-    /// SM62 (Tegra, less RF)
-    struct Policy620 : ChainedPolicy<620, Policy620, Policy610>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 5,
-            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
-        };
-
-        // ScanPolicy
-        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 16 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS>       AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef DownsweepPolicy UpsweepPolicy;
-        typedef AltDownsweepPolicy AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy;
-
-        // Segmented policies
-        typedef DownsweepPolicy     SegmentedPolicy;
-        typedef AltDownsweepPolicy  AltSegmentedPolicy;
-    };
-
-
-    /// SM70 (GV100)
-    struct Policy700 : ChainedPolicy<700, Policy700, Policy620>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 6,    // 7.62B 32b keys/s (GV100)
-            SINGLE_TILE_RADIX_BITS  = 6,
-            SEGMENTED_RADIX_BITS    = 6,    // 8.7B 32b segmented keys/s (GV100)
-        };
-
-        // ScanPolicy
-        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 47 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 29 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 47 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>  UpsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 29 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>  AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <256, CUB_MAX(1, 19 / SCALE_FACTOR_4B),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
-
-        // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <192, CUB_MAX(1, 39 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <384, CUB_MAX(1, 11 / SCALE_FACTOR_4B),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
-    };
-
-
-    /// MaxPolicy
-    typedef Policy700 MaxPolicy;
-
-
-};
-
-
-
-/******************************************************************************
- * Single-problem dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort
- */
-template <
-    bool     IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low
-    typename KeyT,          ///< Key type
-    typename ValueT,        ///< Value type
-    typename OffsetT>       ///< Signed integer type for global offsets
-struct DispatchRadixSort :
-    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>
-{
-    //------------------------------------------------------------------------------
-    // Constants
-    //------------------------------------------------------------------------------
-
-    enum
-    {
-        // Whether this is a keys-only (or key-value) sort
-        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
-    };
-
-
-    //------------------------------------------------------------------------------
-    // Problem state
-    //------------------------------------------------------------------------------
-
-    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-    OffsetT                 num_items;              ///< [in] Number of items to sort
-    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
-    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    int                     ptx_version;            ///< [in] PTX version
-    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers
-
-
-    //------------------------------------------------------------------------------
-    // Constructor
-    //------------------------------------------------------------------------------
-
-    /// Constructor
-    CUB_RUNTIME_FUNCTION __forceinline__
-    DispatchRadixSort(
-        void*                   d_temp_storage,
-        size_t                  &temp_storage_bytes,
-        DoubleBuffer<KeyT>      &d_keys,
-        DoubleBuffer<ValueT>    &d_values,
-        OffsetT                 num_items,
-        int                     begin_bit,
-        int                     end_bit,
-        bool                    is_overwrite_okay,
-        cudaStream_t            stream,
-        bool                    debug_synchronous,
-        int                     ptx_version)
-    :
-        d_temp_storage(d_temp_storage),
-        temp_storage_bytes(temp_storage_bytes),
-        d_keys(d_keys),
-        d_values(d_values),
-        num_items(num_items),
-        begin_bit(begin_bit),
-        end_bit(end_bit),
-        stream(stream),
-        debug_synchronous(debug_synchronous),
-        ptx_version(ptx_version),
-        is_overwrite_okay(is_overwrite_okay)
-    {}
-
-
-    //------------------------------------------------------------------------------
-    // Small-problem (single tile) invocation
-    //------------------------------------------------------------------------------
-
-    /// Invoke a single block to sort in-core
-    template <
-        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
-        typename                SingleTileKernelT>      ///< Function type of cub::DeviceRadixSortSingleTileKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokeSingleTile(
-        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)single_tile_kernel;
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-            {
-                temp_storage_bytes = 1;
-                break;
-            }
-
-            // Return if empty problem
-            if (num_items == 0)
-                break;
-
-            // Log single_tile_kernel configuration
-            if (debug_synchronous)
-                _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
-                    1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
-                    ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);
-
-            // Invoke upsweep_kernel with same grid size as downsweep_kernel
-            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_keys.Current(),
-                d_keys.Alternate(),
-                d_values.Current(),
-                d_values.Alternate(),
-                num_items,
-                begin_bit,
-                end_bit);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Update selector
-            d_keys.selector ^= 1;
-            d_values.selector ^= 1;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Normal problem size invocation
-    //------------------------------------------------------------------------------
-
-    /**
-     * Invoke a three-kernel sorting pass at the current bit.
-     */
-    template <typename PassConfigT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePass(
-        const KeyT      *d_keys_in,
-        KeyT            *d_keys_out,
-        const ValueT    *d_values_in,
-        ValueT          *d_values_out,
-        OffsetT         *d_spine,
-        int             spine_length,
-        int             &current_bit,
-        PassConfigT     &pass_config)
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
-
-            // Log upsweep_kernel configuration
-            if (debug_synchronous)
-                _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
-                pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
-                pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
-
-            // Invoke upsweep_kernel with same grid size as downsweep_kernel
-            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
-                d_keys_in,
-                d_spine,
-                num_items,
-                current_bit,
-                pass_bits,
-                pass_config.even_share);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Log scan_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
-                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
-
-            // Invoke scan_kernel
-            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
-                d_spine,
-                spine_length);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Log downsweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
-                pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);
-
-            // Invoke downsweep_kernel
-            pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
-                d_keys_in,
-                d_keys_out,
-                d_values_in,
-                d_values_out,
-                d_spine,
-                num_items,
-                current_bit,
-                pass_bits,
-                pass_config.even_share);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Update current bit
-            current_bit += pass_bits;
-        }
-        while (0);
-
-        return error;
-    }
-
-
-
-    /// Pass configuration structure
-    template <
-        typename UpsweepKernelT,
-        typename ScanKernelT,
-        typename DownsweepKernelT>
-    struct PassConfig
-    {
-        UpsweepKernelT          upsweep_kernel;
-        KernelConfig            upsweep_config;
-        ScanKernelT             scan_kernel;
-        KernelConfig            scan_config;
-        DownsweepKernelT        downsweep_kernel;
-        KernelConfig            downsweep_config;
-        int                     radix_bits;
-        int                     radix_digits;
-        int                     max_downsweep_grid_size;
-        GridEvenShare<OffsetT>  even_share;
-
-        /// Initialize pass configuration
-        template <
-            typename UpsweepPolicyT,
-            typename ScanPolicyT,
-            typename DownsweepPolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        cudaError_t InitPassConfig(
-            UpsweepKernelT      upsweep_kernel,
-            ScanKernelT         scan_kernel,
-            DownsweepKernelT    downsweep_kernel,
-            int                 ptx_version,
-            int                 sm_count,
-            int                 num_items)
-        {
-            cudaError error = cudaSuccess;
-            do
-            {
-                this->upsweep_kernel    = upsweep_kernel;
-                this->scan_kernel       = scan_kernel;
-                this->downsweep_kernel  = downsweep_kernel;
-                radix_bits              = DownsweepPolicyT::RADIX_BITS;
-                radix_digits            = 1 << radix_bits;
-
-                if (CubDebug(error = upsweep_config.Init<UpsweepPolicyT>(upsweep_kernel))) break;
-                if (CubDebug(error = scan_config.Init<ScanPolicyT>(scan_kernel))) break;
-                if (CubDebug(error = downsweep_config.Init<DownsweepPolicyT>(downsweep_kernel))) break;
-
-                max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version);
-
-                even_share.DispatchInit(
-                    num_items,
-                    max_downsweep_grid_size,
-                    CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
-
-            }
-            while (0);
-            return error;
-        }
-
-    };
-
-
-    /// Invocation (run multiple digit passes)
-    template <
-        typename            ActivePolicyT,          ///< Umbrella policy active for the target device
-        typename            UpsweepKernelT,         ///< Function type of cub::DeviceRadixSortUpsweepKernel
-        typename            ScanKernelT,            ///< Function type of cub::SpineScanKernel
-        typename            DownsweepKernelT>       ///< Function type of cub::DeviceRadixSortDownsweepKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePasses(
-        UpsweepKernelT      upsweep_kernel,         ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
-        UpsweepKernelT      alt_upsweep_kernel,     ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
-        ScanKernelT         scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
-        DownsweepKernelT    downsweep_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
-        DownsweepKernelT    alt_downsweep_kernel)   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)upsweep_kernel;
-        (void)alt_upsweep_kernel;
-        (void)scan_kernel;
-        (void)downsweep_kernel;
-        (void)alt_downsweep_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Init regular and alternate-digit kernel configurations
-            PassConfig<UpsweepKernelT, ScanKernelT, DownsweepKernelT> pass_config, alt_pass_config;
-            if ((error = pass_config.template InitPassConfig<
-                    typename ActivePolicyT::UpsweepPolicy, 
-                    typename ActivePolicyT::ScanPolicy, 
-                    typename ActivePolicyT::DownsweepPolicy>(
-                upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break;
-
-            if ((error = alt_pass_config.template InitPassConfig<
-                    typename ActivePolicyT::AltUpsweepPolicy, 
-                    typename ActivePolicyT::ScanPolicy, 
-                    typename ActivePolicyT::AltDownsweepPolicy>(
-                alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;
-
-            // Get maximum spine length
-            int max_grid_size       = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
-            int spine_length        = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
-
-            // Temporary storage allocation requirements
-            void* allocations[3];
-            size_t allocation_sizes[3] =
-            {
-                spine_length * sizeof(OffsetT),                                         // bytes needed for privatized block digit histograms
-                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                     // bytes needed for 3rd keys buffer
-                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),    // bytes needed for 3rd values buffer
-            };
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-                return cudaSuccess;
-
-            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
-            int num_bits            = end_bit - begin_bit;
-            int num_passes          = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
-            bool is_num_passes_odd  = num_passes & 1;
-            int max_alt_passes      = (num_passes * pass_config.radix_bits) - num_bits;
-            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
-
-            // Alias the temporary storage allocations
-            OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);
-
-            DoubleBuffer<KeyT> d_keys_remaining_passes(
-                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
-                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());
-
-            DoubleBuffer<ValueT> d_values_remaining_passes(
-                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
-                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());
-
-            // Run first pass, consuming from the input's current buffers
-            int current_bit = begin_bit;
-            if (CubDebug(error = InvokePass(
-                d_keys.Current(), d_keys_remaining_passes.Current(),
-                d_values.Current(), d_values_remaining_passes.Current(),
-                d_spine, spine_length, current_bit,
-                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
-
-            // Run remaining passes
-            while (current_bit < end_bit)
-            {
-                if (CubDebug(error = InvokePass(
-                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
-                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
-                    d_spine, spine_length, current_bit,
-                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;;
-
-                // Invert selectors
-                d_keys_remaining_passes.selector ^= 1;
-                d_values_remaining_passes.selector ^= 1;
-            }
-
-            // Update selector
-            if (!is_overwrite_okay) {
-                num_passes = 1; // Sorted data always ends up in the other vector
-            }
-
-            d_keys.selector = (d_keys.selector + num_passes) & 1;
-            d_values.selector = (d_values.selector + num_passes) & 1;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Chained policy invocation
-    //------------------------------------------------------------------------------
-
-    /// Invocation
-    template <typename ActivePolicyT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t Invoke()
-    {
-        typedef typename DispatchRadixSort::MaxPolicy       MaxPolicyT;
-        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
-
-        // Force kernel code-generation in all compiler passes
-        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
-        {
-            // Small, single tile size
-            return InvokeSingleTile<ActivePolicyT>(
-                DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);
-        }
-        else
-        {
-            // Regular size
-            return InvokePasses<ActivePolicyT>(
-                DeviceRadixSortUpsweepKernel<   MaxPolicyT, false,   IS_DESCENDING, KeyT, OffsetT>,
-                DeviceRadixSortUpsweepKernel<   MaxPolicyT, true,    IS_DESCENDING, KeyT, OffsetT>,
-                RadixSortScanBinsKernel<        MaxPolicyT, OffsetT>,
-                DeviceRadixSortDownsweepKernel< MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetT>,
-                DeviceRadixSortDownsweepKernel< MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetT>);
-        }
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Dispatch entrypoints
-    //------------------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        OffsetT                 num_items,              ///< [in] Number of items to sort
-        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
-        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
-
-        cudaError_t error;
-        do {
-            // Get PTX version
-            int ptx_version;
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-
-            // Create dispatch functor
-            DispatchRadixSort dispatch(
-                d_temp_storage, temp_storage_bytes,
-                d_keys, d_values,
-                num_items, begin_bit, end_bit, is_overwrite_okay,
-                stream, debug_synchronous, ptx_version);
-
-            // Dispatch to chained policy
-            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
-
-        } while (0);
-
-        return error;
-    }
-};
-
-
-
-
-/******************************************************************************
- * Segmented dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort
- */
-template <
-    bool     IS_DESCENDING,     ///< Whether or not the sorted-order is high-to-low
-    typename KeyT,              ///< Key type
-    typename ValueT,            ///< Value type
-    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
-    typename OffsetT>           ///< Signed integer type for global offsets
-struct DispatchSegmentedRadixSort :
-    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>
-{
-    //------------------------------------------------------------------------------
-    // Constants
-    //------------------------------------------------------------------------------
-
-    enum
-    {
-        // Whether this is a keys-only (or key-value) sort
-        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
-    };
-
-
-    //------------------------------------------------------------------------------
-    // Parameter members
-    //------------------------------------------------------------------------------
-
-    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-    OffsetT                 num_items;              ///< [in] Number of items to sort
-    OffsetT                 num_segments;           ///< [in] The number of segments that comprise the sorting data
-    OffsetIteratorT         d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-    OffsetIteratorT         d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
-    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    int                     ptx_version;            ///< [in] PTX version
-    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers
-
-
-    //------------------------------------------------------------------------------
-    // Constructors
-    //------------------------------------------------------------------------------
-
-    /// Constructor
-    CUB_RUNTIME_FUNCTION __forceinline__
-    DispatchSegmentedRadixSort(
-        void*                   d_temp_storage,
-        size_t                  &temp_storage_bytes,
-        DoubleBuffer<KeyT>      &d_keys,
-        DoubleBuffer<ValueT>    &d_values,
-        OffsetT                 num_items,
-        OffsetT                 num_segments,
-        OffsetIteratorT         d_begin_offsets,
-        OffsetIteratorT         d_end_offsets,
-        int                     begin_bit,
-        int                     end_bit,
-        bool                    is_overwrite_okay,
-        cudaStream_t            stream,
-        bool                    debug_synchronous,
-        int                     ptx_version)
-    :
-        d_temp_storage(d_temp_storage),
-        temp_storage_bytes(temp_storage_bytes),
-        d_keys(d_keys),
-        d_values(d_values),
-        num_items(num_items),
-        num_segments(num_segments),
-        d_begin_offsets(d_begin_offsets),
-        d_end_offsets(d_end_offsets),
-        begin_bit(begin_bit),
-        end_bit(end_bit),
-        is_overwrite_okay(is_overwrite_okay),
-        stream(stream),
-        debug_synchronous(debug_synchronous),
-        ptx_version(ptx_version)
-    {}
-
-
-    //------------------------------------------------------------------------------
-    // Multi-segment invocation
-    //------------------------------------------------------------------------------
-
-    /// Invoke a three-kernel sorting pass at the current bit.
-    template <typename PassConfigT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePass(
-        const KeyT      *d_keys_in,
-        KeyT            *d_keys_out,
-        const ValueT    *d_values_in,
-        ValueT          *d_values_out,
-        int             &current_bit,
-        PassConfigT     &pass_config)
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
-
-            // Log kernel configuration
-            if (debug_synchronous)
-                _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
-                    num_segments, pass_config.segmented_config.block_threads, (long long) stream,
-                pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);
-
-            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
-                d_keys_in, d_keys_out,
-                d_values_in,  d_values_out,
-                d_begin_offsets, d_end_offsets, num_segments,
-                current_bit, pass_bits);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Update current bit
-            current_bit += pass_bits;
-        }
-        while (0);
-
-        return error;
-    }
-
-
-    /// PassConfig data structure
-    template <typename SegmentedKernelT>
-    struct PassConfig
-    {
-        SegmentedKernelT    segmented_kernel;
-        KernelConfig        segmented_config;
-        int                 radix_bits;
-        int                 radix_digits;
-
-        /// Initialize pass configuration
-        template <typename SegmentedPolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
-        {
-            this->segmented_kernel  = segmented_kernel;
-            this->radix_bits        = SegmentedPolicyT::RADIX_BITS;
-            this->radix_digits      = 1 << radix_bits;
-
-            return CubDebug(segmented_config.Init<SegmentedPolicyT>(segmented_kernel));
-        }
-    };
-
-
-    /// Invocation (run multiple digit passes)
-    template <
-        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
-        typename                SegmentedKernelT>       ///< Function type of cub::DeviceSegmentedRadixSortKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePasses(
-        SegmentedKernelT     segmented_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
-        SegmentedKernelT     alt_segmented_kernel)      ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-      (void)segmented_kernel;
-      (void)alt_segmented_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Init regular and alternate kernel configurations
-            PassConfig<SegmentedKernelT> pass_config, alt_pass_config;
-            if ((error = pass_config.template       InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;
-            if ((error = alt_pass_config.template   InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;
-
-            // Temporary storage allocation requirements
-            void* allocations[2];
-            size_t allocation_sizes[2] =
-            {
-                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                      // bytes needed for 3rd keys buffer
-                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),     // bytes needed for 3rd values buffer
-            };
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-            {
-                if (temp_storage_bytes == 0)
-                    temp_storage_bytes = 1;
-                return cudaSuccess;
-            }
-
-            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
-            int radix_bits          = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
-            int alt_radix_bits      = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
-            int num_bits            = end_bit - begin_bit;
-            int num_passes          = (num_bits + radix_bits - 1) / radix_bits;
-            bool is_num_passes_odd  = num_passes & 1;
-            int max_alt_passes      = (num_passes * radix_bits) - num_bits;
-            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
-
-            DoubleBuffer<KeyT> d_keys_remaining_passes(
-                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
-                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0]) : d_keys.Alternate());
-
-            DoubleBuffer<ValueT> d_values_remaining_passes(
-                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),
-                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1]) : d_values.Alternate());
-
-            // Run first pass, consuming from the input's current buffers
-            int current_bit = begin_bit;
-
-            if (CubDebug(error = InvokePass(
-                d_keys.Current(), d_keys_remaining_passes.Current(),
-                d_values.Current(), d_values_remaining_passes.Current(),
-                current_bit,
-                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
-
-            // Run remaining passes
-            while (current_bit < end_bit)
-            {
-                if (CubDebug(error = InvokePass(
-                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
-                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
-                    current_bit,
-                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
-
-                // Invert selectors and update current bit
-                d_keys_remaining_passes.selector ^= 1;
-                d_values_remaining_passes.selector ^= 1;
-            }
-
-            // Update selector
-            if (!is_overwrite_okay) {
-                num_passes = 1; // Sorted data always ends up in the other vector
-            }
-
-            d_keys.selector = (d_keys.selector + num_passes) & 1;
-            d_values.selector = (d_values.selector + num_passes) & 1;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Chained policy invocation
-    //------------------------------------------------------------------------------
-
-    /// Invocation
-    template <typename ActivePolicyT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t Invoke()
-    {
-        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
-
-        // Force kernel code-generation in all compiler passes
-        return InvokePasses<ActivePolicyT>(
-            DeviceSegmentedRadixSortKernel<MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>,
-            DeviceSegmentedRadixSortKernel<MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>);
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Dispatch entrypoints
-    //------------------------------------------------------------------------------
-
-
-    /// Internal dispatch routine
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,              ///< [in] Number of items to sort
-        int                     num_segments,           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT         d_begin_offsets,        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT         d_end_offsets,          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
-        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
-
-        cudaError_t error;
-        do {
-            // Get PTX version
-            int ptx_version;
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-
-            // Create dispatch functor
-            DispatchSegmentedRadixSort dispatch(
-                d_temp_storage, temp_storage_bytes,
-                d_keys, d_values,
-                num_items, num_segments, d_begin_offsets, d_end_offsets,
-                begin_bit, end_bit, is_overwrite_okay,
-                stream, debug_synchronous, ptx_version);
-
-            // Dispatch to chained policy
-            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
-
-        } while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/dispatch/dispatch_reduce.cuh b/external/cub/cub/device/dispatch/dispatch_reduce.cuh
deleted file mode 100644
index b6aa44cc0e5..00000000000
--- a/external/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ /dev/null
@@ -1,882 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../agent/agent_reduce.cuh"
-#include "../../iterator/arg_index_input_iterator.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_even_share.cuh"
-#include "../../iterator/arg_index_input_iterator.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Reduce region kernel entry point (multi-block).  Computes privatized reductions, one per thread block.
- */
-template <
-    typename                ChainedPolicyT,             ///< Chained tuning policy
-    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
-    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
-    typename                OffsetT,                    ///< Signed integer type for global offsets
-    typename                ReductionOpT>               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
-__global__ void DeviceReduceKernel(
-    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
-    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
-    OffsetT                 num_items,                  ///< [in] Total number of input data items
-    GridEvenShare<OffsetT>  even_share,                 ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
-    ReductionOpT            reduction_op)               ///< [in] Binary reduction functor
-{
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // Thread block type for reducing input tiles
-    typedef AgentReduce<
-            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
-            InputIteratorT,
-            OutputIteratorT,
-            OffsetT,
-            ReductionOpT>
-        AgentReduceT;
-
-    // Shared memory storage
-    __shared__ typename AgentReduceT::TempStorage temp_storage;
-
-    // Consume input tiles
-    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);
-
-    // Output result
-    if (threadIdx.x == 0)
-        d_out[blockIdx.x] = block_aggregate;
-}
-
-
-/**
- * Reduce a single tile kernel entry point (single-block).  Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass.
- */
-template <
-    typename                ChainedPolicyT,             ///< Chained tuning policy
-    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
-    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
-    typename                OffsetT,                    ///< Signed integer type for global offsets
-    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename                OuputT>                     ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
-__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
-__global__ void DeviceReduceSingleTileKernel(
-    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
-    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
-    OffsetT                 num_items,                  ///< [in] Total number of input data items
-    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor
-    OuputT                  init)                       ///< [in] The initial value of the reduction
-{
-    // Thread block type for reducing input tiles
-    typedef AgentReduce<
-            typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
-            InputIteratorT,
-            OutputIteratorT,
-            OffsetT,
-            ReductionOpT>
-        AgentReduceT;
-
-    // Shared memory storage
-    __shared__ typename AgentReduceT::TempStorage temp_storage;
-
-    // Check if empty problem
-    if (num_items == 0)
-    {
-        if (threadIdx.x == 0)
-            *d_out = init;
-        return;
-    }
-
-    // Consume input tiles
-    OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
-        OffsetT(0),
-        num_items);
-
-    // Output result
-    if (threadIdx.x == 0)
-        *d_out = reduction_op(init, block_aggregate);
-}
-
-
-/// Normalize input iterator to segment offset
-template <typename T, typename OffsetT, typename IteratorT>
-__device__ __forceinline__
-void NormalizeReductionOutput(
-    T &/*val*/,
-    OffsetT /*base_offset*/,
-    IteratorT /*itr*/)
-{}
-
-
-/// Normalize input iterator to segment offset (specialized for arg-index)
-template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
-__device__ __forceinline__
-void NormalizeReductionOutput(
-    KeyValuePairT &val,
-    OffsetT base_offset,
-    ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
-{
-    val.key -= base_offset;
-}
-
-
-/**
- * Segmented reduction (one block per segment)
- */
-template <
-    typename                ChainedPolicyT,             ///< Chained tuning policy
-    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
-    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
-    typename                OffsetIteratorT,            ///< Random-access input iterator type for reading segment offsets \iterator
-    typename                OffsetT,                    ///< Signed integer type for global offsets
-    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename                OutputT>                    ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
-__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
-__global__ void DeviceSegmentedReduceKernel(
-    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
-    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
-    OffsetIteratorT         d_begin_offsets,            ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-    OffsetIteratorT         d_end_offsets,              ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-    int                     /*num_segments*/,           ///< [in] The number of segments that comprise the sorting data
-    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor 
-    OutputT                 init)                       ///< [in] The initial value of the reduction
-{
-    // Thread block type for reducing input tiles
-    typedef AgentReduce<
-            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
-            InputIteratorT,
-            OutputIteratorT,
-            OffsetT,
-            ReductionOpT>
-        AgentReduceT;
-
-    // Shared memory storage
-    __shared__ typename AgentReduceT::TempStorage temp_storage;
-
-    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
-    OffsetT segment_end     = d_end_offsets[blockIdx.x];
-
-    // Check if empty problem
-    if (segment_begin == segment_end)
-    {
-        if (threadIdx.x == 0)
-            d_out[blockIdx.x] = init;
-        return;
-    }
-
-    // Consume input tiles
-    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
-        segment_begin,
-        segment_end);
-
-    // Normalize as needed
-    NormalizeReductionOutput(block_aggregate, segment_begin, d_in);
-
-    if (threadIdx.x == 0)
-        d_out[blockIdx.x] = reduction_op(init, block_aggregate);;
-}
-
-
-
-
-/******************************************************************************
- * Policy
- ******************************************************************************/
-
-template <
-    typename OuputT,            ///< Data type
-    typename OffsetT,           ///< Signed integer type for global offsets
-    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
-struct DeviceReducePolicy
-{
-    //------------------------------------------------------------------------------
-    // Architecture-specific tuning policies
-    //------------------------------------------------------------------------------
-
-    /// SM13
-    struct Policy130 : ChainedPolicy<130, Policy130, Policy130>
-    {
-        // ReducePolicy
-        typedef AgentReducePolicy<
-                CUB_NOMINAL_CONFIG(128, 8, OuputT), ///< Threads per block, items per thread
-                2,                                  ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT>                       ///< Cache load modifier
-            ReducePolicy;
-
-        // SingleTilePolicy
-        typedef ReducePolicy SingleTilePolicy;
-
-        // SegmentedReducePolicy
-        typedef ReducePolicy SegmentedReducePolicy;
-    };
-
-
-    /// SM20
-    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
-    {
-        // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items)
-        typedef AgentReducePolicy<
-                CUB_NOMINAL_CONFIG(128, 8, OuputT),     ///< Threads per block, items per thread
-                4,                                      ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                    ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT>                           ///< Cache load modifier
-            ReducePolicy;
-
-        // SingleTilePolicy
-        typedef ReducePolicy SingleTilePolicy;
-
-        // SegmentedReducePolicy
-        typedef ReducePolicy SegmentedReducePolicy;
-    };
-
-
-    /// SM30
-    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
-    {
-        // ReducePolicy (GTX670: 154.0 @ 48M 4B items)
-        typedef AgentReducePolicy<
-                CUB_NOMINAL_CONFIG(256, 20, OuputT),    ///< Threads per block, items per thread
-                2,                                      ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT>                           ///< Cache load modifier
-            ReducePolicy;
-
-        // SingleTilePolicy
-        typedef ReducePolicy SingleTilePolicy;
-
-        // SegmentedReducePolicy
-        typedef ReducePolicy SegmentedReducePolicy;
-    };
-
-
-    /// SM35
-    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
-    {
-        // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items)
-        typedef AgentReducePolicy<
-                CUB_NOMINAL_CONFIG(256, 20, OuputT),    ///< Threads per block, items per thread
-                4,                                      ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
-                LOAD_LDG>                               ///< Cache load modifier
-            ReducePolicy;
-
-        // SingleTilePolicy
-        typedef ReducePolicy SingleTilePolicy;
-
-        // SegmentedReducePolicy
-        typedef ReducePolicy SegmentedReducePolicy;
-    };
-
-    /// SM60
-    struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
-    {
-        // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
-        typedef AgentReducePolicy<
-                CUB_NOMINAL_CONFIG(256, 16, OuputT),    ///< Threads per block, items per thread
-                4,                                      ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,           ///< Cooperative block-wide reduction algorithm to use
-                LOAD_LDG>                               ///< Cache load modifier
-            ReducePolicy;
-
-        // SingleTilePolicy
-        typedef ReducePolicy SingleTilePolicy;
-
-        // SegmentedReducePolicy
-        typedef ReducePolicy SegmentedReducePolicy;
-    };
-
-
-    /// MaxPolicy
-    typedef Policy600 MaxPolicy;
-
-};
-
-
-
-/******************************************************************************
- * Single-problem dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
- */
-template <
-    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
-    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
-    typename OffsetT,           ///< Signed integer type for global offsets
-    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
-struct DispatchReduce :
-    DeviceReducePolicy<
-        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,                          // ... else the output iterator's value type
-        OffsetT,
-        ReductionOpT>
-{
-    //------------------------------------------------------------------------------
-    // Constants
-    //------------------------------------------------------------------------------
-
-    // Data type of output iterator
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-
-    //------------------------------------------------------------------------------
-    // Problem state
-    //------------------------------------------------------------------------------
-
-    void                *d_temp_storage;                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-    size_t              &temp_storage_bytes;            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-    InputIteratorT      d_in;                           ///< [in] Pointer to the input sequence of data items
-    OutputIteratorT     d_out;                          ///< [out] Pointer to the output aggregate
-    OffsetT             num_items;                      ///< [in] Total number of input items (i.e., length of \p d_in)
-    ReductionOpT        reduction_op;                   ///< [in] Binary reduction functor 
-    OutputT             init;                           ///< [in] The initial value of the reduction
-    cudaStream_t        stream;                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-    bool                debug_synchronous;              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    int                 ptx_version;                    ///< [in] PTX version
-
-    //------------------------------------------------------------------------------
-    // Constructor
-    //------------------------------------------------------------------------------
-
-    /// Constructor
-    CUB_RUNTIME_FUNCTION __forceinline__
-    DispatchReduce(
-        void*                   d_temp_storage,
-        size_t                  &temp_storage_bytes,
-        InputIteratorT          d_in,
-        OutputIteratorT         d_out,
-        OffsetT                 num_items,
-        ReductionOpT            reduction_op,
-        OutputT                 init,
-        cudaStream_t            stream,
-        bool                    debug_synchronous,
-        int                     ptx_version)
-    :
-        d_temp_storage(d_temp_storage),
-        temp_storage_bytes(temp_storage_bytes),
-        d_in(d_in),
-        d_out(d_out),
-        num_items(num_items),
-        reduction_op(reduction_op),
-        init(init),
-        stream(stream),
-        debug_synchronous(debug_synchronous),
-        ptx_version(ptx_version)
-    {}
-
-
-    //------------------------------------------------------------------------------
-    // Small-problem (single tile) invocation
-    //------------------------------------------------------------------------------
-
-    /// Invoke a single block block to reduce in-core
-    template <
-        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
-        typename                SingleTileKernelT>      ///< Function type of cub::DeviceReduceSingleTileKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokeSingleTile(
-        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)single_tile_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-            {
-                temp_storage_bytes = 1;
-                break;
-            }
-
-            // Log single_reduce_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
-                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
-                (long long) stream,
-                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
-
-            // Invoke single_reduce_sweep_kernel
-            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_in,
-                d_out,
-                num_items,
-                reduction_op,
-                init);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Normal problem size invocation (two-pass)
-    //------------------------------------------------------------------------------
-
-    /// Invoke two-passes to reduce
-    template <
-        typename                ActivePolicyT,              ///< Umbrella policy active for the target device
-        typename                ReduceKernelT,              ///< Function type of cub::DeviceReduceKernel
-        typename                SingleTileKernelT>          ///< Function type of cub::DeviceReduceSingleTileKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePasses(
-        ReduceKernelT           reduce_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
-        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)                  reduce_kernel;
-        (void)                  single_tile_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Init regular kernel configuration
-            KernelConfig reduce_config;
-            if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;
-            int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;
-
-            // Even-share work distribution
-            int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
-            GridEvenShare<OffsetT> even_share;
-            even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);
-
-            // Temporary storage allocation requirements
-            void* allocations[1];
-            size_t allocation_sizes[1] =
-            {
-                max_blocks * sizeof(OutputT)    // bytes needed for privatized block reductions
-            };
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                return cudaSuccess;
-            }
-
-            // Alias the allocation for the privatized per-block reductions
-            OutputT *d_block_reductions = (OutputT*) allocations[0];
-
-            // Get grid size for device_reduce_sweep_kernel
-            int reduce_grid_size = even_share.grid_size;
-
-            // Log device_reduce_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                reduce_grid_size,
-                ActivePolicyT::ReducePolicy::BLOCK_THREADS,
-                (long long) stream,
-                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
-                reduce_config.sm_occupancy);
-
-            // Invoke DeviceReduceKernel
-            reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_in,
-                d_block_reductions,
-                num_items,
-                even_share,
-                reduction_op);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Log single_reduce_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
-                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
-                (long long) stream,
-                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
-
-            // Invoke DeviceReduceSingleTileKernel
-            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_block_reductions,
-                d_out,
-                reduce_grid_size,
-                reduction_op,
-                init);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Chained policy invocation
-    //------------------------------------------------------------------------------
-
-    /// Invocation
-    template <typename ActivePolicyT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t Invoke()
-    {
-        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
-        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;
-
-        // Force kernel code-generation in all compiler passes
-        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
-        {
-            // Small, single tile size
-            return InvokeSingleTile<ActivePolicyT>(
-                DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
-        }
-        else
-        {
-            // Regular size
-            return InvokePasses<ActivePolicyT>(
-                DeviceReduceKernel<typename DispatchReduce::MaxPolicy, InputIteratorT, OutputT*, OffsetT, ReductionOpT>,
-                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
-        }
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Dispatch entrypoints
-    //------------------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduction
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregate
-        OffsetT         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor 
-        OutputT         init,                               ///< [in] The initial value of the reduction
-        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        typedef typename DispatchReduce::MaxPolicy MaxPolicyT;
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-
-            // Create dispatch functor
-            DispatchReduce dispatch(
-                d_temp_storage, temp_storage_bytes,
-                d_in, d_out, num_items, reduction_op, init,
-                stream, debug_synchronous, ptx_version);
-
-            // Dispatch to chained policy
-            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-
-/******************************************************************************
- * Segmented dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
- */
-template <
-    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
-    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
-    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
-    typename OffsetT,           ///< Signed integer type for global offsets
-    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
-struct DispatchSegmentedReduce :
-    DeviceReducePolicy<
-        typename std::iterator_traits<InputIteratorT>::value_type,
-        OffsetT,
-        ReductionOpT>
-{
-    //------------------------------------------------------------------------------
-    // Constants
-    //------------------------------------------------------------------------------
-
-    /// The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-
-    //------------------------------------------------------------------------------
-    // Problem state
-    //------------------------------------------------------------------------------
-
-    void                *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-    size_t              &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-    InputIteratorT      d_in;                   ///< [in] Pointer to the input sequence of data items
-    OutputIteratorT     d_out;                  ///< [out] Pointer to the output aggregate
-    OffsetT             num_segments;           ///< [in] The number of segments that comprise the sorting data
-    OffsetIteratorT     d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-    OffsetIteratorT     d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-    ReductionOpT        reduction_op;           ///< [in] Binary reduction functor 
-    OutputT             init;                   ///< [in] The initial value of the reduction
-    cudaStream_t        stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-    bool                debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    int                 ptx_version;            ///< [in] PTX version
-
-    //------------------------------------------------------------------------------
-    // Constructor
-    //------------------------------------------------------------------------------
-
-    /// Constructor
-    CUB_RUNTIME_FUNCTION __forceinline__
-    DispatchSegmentedReduce(
-        void*                   d_temp_storage,
-        size_t                  &temp_storage_bytes,
-        InputIteratorT          d_in,
-        OutputIteratorT         d_out,
-        OffsetT                 num_segments,
-        OffsetIteratorT         d_begin_offsets,
-        OffsetIteratorT         d_end_offsets,
-        ReductionOpT            reduction_op,
-        OutputT                 init,
-        cudaStream_t            stream,
-        bool                    debug_synchronous,
-        int                     ptx_version)
-    :
-        d_temp_storage(d_temp_storage),
-        temp_storage_bytes(temp_storage_bytes),
-        d_in(d_in),
-        d_out(d_out),
-        num_segments(num_segments),
-        d_begin_offsets(d_begin_offsets),
-        d_end_offsets(d_end_offsets),
-        reduction_op(reduction_op),
-        init(init),
-        stream(stream),
-        debug_synchronous(debug_synchronous),
-        ptx_version(ptx_version)
-    {}
-
-
-
-    //------------------------------------------------------------------------------
-    // Chained policy invocation
-    //------------------------------------------------------------------------------
-
-    /// Invocation
-    template <
-        typename                        ActivePolicyT,                  ///< Umbrella policy active for the target device
-        typename                        DeviceSegmentedReduceKernelT>   ///< Function type of cub::DeviceSegmentedReduceKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePasses(
-        DeviceSegmentedReduceKernelT    segmented_reduce_kernel)        ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)segmented_reduce_kernel;
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-            {
-                temp_storage_bytes = 1;
-                return cudaSuccess;
-            }
-
-            // Init kernel configuration
-            KernelConfig segmented_reduce_config;
-            if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;
-
-            // Log device_reduce_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                num_segments,
-                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
-                (long long) stream,
-                ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
-                segmented_reduce_config.sm_occupancy);
-
-            // Invoke DeviceReduceKernel
-            segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_in,
-                d_out,
-                d_begin_offsets,
-                d_end_offsets,
-                num_segments,
-                reduction_op,
-                init);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-
-    }
-
-
-    /// Invocation
-    template <typename ActivePolicyT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t Invoke()
-    {
-        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;
-
-        // Force kernel code-generation in all compiler passes
-        return InvokePasses<ActivePolicyT>(
-            DeviceSegmentedReduceKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOpT, OutputT>);
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Dispatch entrypoints
-    //------------------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduction
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregate
-        int             num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor 
-        OutputT         init,                               ///< [in] The initial value of the reduction
-        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;
-
-        if (num_segments <= 0)
-            return cudaSuccess;
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-
-            // Create dispatch functor
-            DispatchSegmentedReduce dispatch(
-                d_temp_storage, temp_storage_bytes,
-                d_in, d_out,
-                num_segments, d_begin_offsets, d_end_offsets,
-                reduction_op, init,
-                stream, debug_synchronous, ptx_version);
-
-            // Dispatch to chained policy
-            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/external/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
deleted file mode 100644
index 672bc49393a..00000000000
--- a/external/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ /dev/null
@@ -1,554 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch_scan.cuh"
-#include "../../agent/agent_reduce_by_key.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Multi-block reduce-by-key sweep kernel entry point
- */
-template <
-    typename            AgentReduceByKeyPolicyT,                 ///< Parameterized AgentReduceByKeyPolicyT tuning policy type
-    typename            KeysInputIteratorT,                     ///< Random-access input iterator type for keys
-    typename            UniqueOutputIteratorT,                  ///< Random-access output iterator type for keys
-    typename            ValuesInputIteratorT,                   ///< Random-access input iterator type for values
-    typename            AggregatesOutputIteratorT,              ///< Random-access output iterator type for values
-    typename            NumRunsOutputIteratorT,                 ///< Output iterator type for recording number of segments encountered
-    typename            ScanTileStateT,                         ///< Tile status interface type
-    typename            EqualityOpT,                            ///< KeyT equality operator type
-    typename            ReductionOpT,                           ///< ValueT reduction operator type
-    typename            OffsetT>                                ///< Signed integer type for global offsets
-__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS))
-__global__ void DeviceReduceByKeyKernel(
-    KeysInputIteratorT          d_keys_in,                      ///< Pointer to the input sequence of keys
-    UniqueOutputIteratorT       d_unique_out,                   ///< Pointer to the output sequence of unique keys (one key per run)
-    ValuesInputIteratorT        d_values_in,                    ///< Pointer to the input sequence of corresponding values
-    AggregatesOutputIteratorT   d_aggregates_out,               ///< Pointer to the output sequence of value aggregates (one aggregate per run)
-    NumRunsOutputIteratorT      d_num_runs_out,                 ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-    ScanTileStateT              tile_state,                     ///< Tile status interface
-    int                         start_tile,                     ///< The starting tile for the current grid
-    EqualityOpT                 equality_op,                    ///< KeyT equality operator
-    ReductionOpT                reduction_op,                   ///< ValueT reduction operator
-    OffsetT                     num_items)                      ///< Total number of items to select from
-{
-    // Thread block type for reducing tiles of value segments
-    typedef AgentReduceByKey<
-            AgentReduceByKeyPolicyT,
-            KeysInputIteratorT,
-            UniqueOutputIteratorT,
-            ValuesInputIteratorT,
-            AggregatesOutputIteratorT,
-            NumRunsOutputIteratorT,
-            EqualityOpT,
-            ReductionOpT,
-            OffsetT>
-        AgentReduceByKeyT;
-
-    // Shared memory for AgentReduceByKey
-    __shared__ typename AgentReduceByKeyT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange(
-        num_items,
-        tile_state,
-        start_tile);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
- */
-template <
-    typename    KeysInputIteratorT,         ///< Random-access input iterator type for keys
-    typename    UniqueOutputIteratorT,      ///< Random-access output iterator type for keys
-    typename    ValuesInputIteratorT,       ///< Random-access input iterator type for values
-    typename    AggregatesOutputIteratorT,  ///< Random-access output iterator type for values
-    typename    NumRunsOutputIteratorT,     ///< Output iterator type for recording number of segments encountered
-    typename    EqualityOpT,                ///< KeyT equality operator type
-    typename    ReductionOpT,               ///< ValueT reduction operator type
-    typename    OffsetT>                    ///< Signed integer type for global offsets
-struct DispatchReduceByKey
-{
-    //-------------------------------------------------------------------------
-    // Types and constants
-    //-------------------------------------------------------------------------
-
-    // The input keys type
-    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
-
-    // The output keys type
-    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
-        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type
-
-    // The input values type
-    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
-
-    // The output values type
-    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
-        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type
-
-    enum
-    {
-        INIT_KERNEL_THREADS     = 128,
-        MAX_INPUT_BYTES         = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)),
-        COMBINED_INPUT_BYTES    = sizeof(KeyOutputT) + sizeof(ValueOutputT),
-    };
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
-
-
-    //-------------------------------------------------------------------------
-    // Tuning policies
-    //-------------------------------------------------------------------------
-
-    /// SM35
-    struct Policy350
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 6,
-            ITEMS_PER_THREAD            = (MAX_INPUT_BYTES <= 8) ? 6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef AgentReduceByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicyT;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 6,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef AgentReduceByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicyT;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 11,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef AgentReduceByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicyT;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 7,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef AgentReduceByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicyT;
-    };
-
-    /// SM11
-    struct Policy110
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 5,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef AgentReduceByKeyPolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_RAKING>
-            ReduceByKeyPolicyT;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy110 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &reduce_by_key_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-        (void)ptx_version;
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
-        }
-        else if (ptx_version >= 300)
-        {
-            reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
-        }
-        else if (ptx_version >= 200)
-        {
-            reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
-        }
-        else if (ptx_version >= 130)
-        {
-            reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
-        }
-        else
-        {
-            reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.
-     */
-    struct KernelConfig
-    {
-        int block_threads;
-        int items_per_thread;
-        int tile_items;
-
-        template <typename PolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads       = PolicyT::BLOCK_THREADS;
-            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
-            tile_items          = block_threads * items_per_thread;
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Dispatch entrypoints
-    //---------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduce-by-key using the
-     * specified kernel functions.
-     */
-    template <
-        typename                    ScanInitKernelT,         ///< Function type of cub::DeviceScanInitKernel
-        typename                    ReduceByKeyKernelT>      ///< Function type of cub::DeviceReduceByKeyKernelT
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        KeysInputIteratorT          d_keys_in,                  ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIteratorT       d_unique_out,               ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        ValuesInputIteratorT        d_values_in,                ///< [in] Pointer to the input sequence of corresponding values
-        AggregatesOutputIteratorT   d_aggregates_out,           ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
-        NumRunsOutputIteratorT      d_num_runs_out,             ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-        EqualityOpT                 equality_op,                ///< [in] KeyT equality operator
-        ReductionOpT                reduction_op,               ///< [in] ValueT reduction operator
-        OffsetT                     num_items,                  ///< [in] Total number of items to select from
-        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous,          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                         /*ptx_version*/,            ///< [in] PTX version of dispatch kernels
-        ScanInitKernelT                init_kernel,                ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        ReduceByKeyKernelT             reduce_by_key_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
-        KernelConfig                reduce_by_key_config)       ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-      (void)d_temp_storage;
-      (void)temp_storage_bytes;
-      (void)d_keys_in;
-      (void)d_unique_out;
-      (void)d_values_in;
-      (void)d_aggregates_out;
-      (void)d_num_runs_out;
-      (void)equality_op;
-      (void)reduction_op;
-      (void)num_items;
-      (void)stream;
-      (void)debug_synchronous;
-      (void)init_kernel;
-      (void)reduce_by_key_kernel;
-      (void)reduce_by_key_config;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[1];
-            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-
-            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
-            void* allocations[1];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Construct the tile status interface
-            ScanTileStateT tile_state;
-            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Log init_kernel configuration
-            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
-            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke init_kernel to initialize tile descriptors
-            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                tile_state,
-                num_tiles,
-                d_num_runs_out);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Return if empty problem
-            if (num_items == 0)
-                break;
-
-            // Get SM occupancy for reduce_by_key_kernel
-            int reduce_by_key_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                reduce_by_key_sm_occupancy,            // out
-                reduce_by_key_kernel,
-                reduce_by_key_config.block_threads))) break;
-
-            // Get max x-dimension of grid
-            int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
-
-            // Run grids in epochs (in case number of tiles exceeds max x-dimension
-            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
-            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
-            {
-                // Log reduce_by_key_kernel configuration
-                if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                    start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);
-
-                // Invoke reduce_by_key_kernel
-                reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
-                    d_keys_in,
-                    d_unique_out,
-                    d_values_in,
-                    d_aggregates_out,
-                    d_num_runs_out,
-                    tile_state,
-                    start_tile,
-                    equality_op,
-                    reduction_op,
-                    num_items);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
-        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
-        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-        EqualityOpT                 equality_op,                    ///< [in] KeyT equality operator
-        ReductionOpT                reduction_op,                   ///< [in] ValueT reduction operator
-        OffsetT                     num_items,                      ///< [in] Total number of items to select from
-        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous)              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig reduce_by_key_config;
-            InitConfigs(ptx_version, reduce_by_key_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_keys_in,
-                d_unique_out,
-                d_values_in,
-                d_aggregates_out,
-                d_num_runs_out,
-                equality_op,
-                reduction_op,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
-                DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
-                reduce_by_key_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/dispatch/dispatch_rle.cuh b/external/cub/cub/device/dispatch/dispatch_rle.cuh
deleted file mode 100644
index 1de979e88cd..00000000000
--- a/external/cub/cub/device/dispatch/dispatch_rle.cuh
+++ /dev/null
@@ -1,538 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch_scan.cuh"
-#include "../../agent/agent_rle.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Select kernel entry point (multi-block)
- *
- * Performs functor-based selection if SelectOp functor type != NullType
- * Otherwise performs flag-based selection if FlagIterator's value type != NullType
- * Otherwise performs discontinuity selection (keep unique)
- */
-template <
-    typename            AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
-    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
-    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values \iterator
-    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values \iterator
-    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered \iterator
-    typename            ScanTileStateT,              ///< Tile status interface type
-    typename            EqualityOpT,                 ///< T equality operator type
-    typename            OffsetT>                    ///< Signed integer type for global offsets
-__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS))
-__global__ void DeviceRleSweepKernel(
-    InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
-    OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run-offsets
-    LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run-lengths
-    NumRunsOutputIteratorT      d_num_runs_out,     ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
-    ScanTileStateT              tile_status,        ///< [in] Tile status interface
-    EqualityOpT                 equality_op,        ///< [in] Equality operator for input items
-    OffsetT                     num_items,          ///< [in] Total number of input items (i.e., length of \p d_in)
-    int                         num_tiles)          ///< [in] Total number of tiles for the entire problem
-{
-    // Thread block type for selecting data from input tiles
-    typedef AgentRle<
-        AgentRlePolicyT,
-        InputIteratorT,
-        OffsetsOutputIteratorT,
-        LengthsOutputIteratorT,
-        EqualityOpT,
-        OffsetT> AgentRleT;
-
-    // Shared memory for AgentRle
-    __shared__ typename AgentRleT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange(
-        num_tiles,
-        tile_status,
-        d_num_runs_out);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceRle
- */
-template <
-    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
-    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values \iterator
-    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values \iterator
-    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered \iterator
-    typename            EqualityOpT,                ///< T equality operator type
-    typename            OffsetT>                    ///< Signed integer type for global offsets
-struct DeviceRleDispatch
-{
-    /******************************************************************************
-     * Types and constants
-     ******************************************************************************/
-
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
-
-    // The lengths output value type
-    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
-        OffsetT,                                                                                                    // ... then the OffsetT type,
-        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
-
-    enum
-    {
-        INIT_KERNEL_THREADS = 128,
-    };
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
-
-
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 15,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef AgentRlePolicy<
-                96,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                true,
-                BLOCK_SCAN_WARP_SCANS>
-            RleSweepPolicy;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 5,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef AgentRlePolicy<
-                256,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RleSweepPolicy;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 15,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef AgentRlePolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_WARP_SCANS>
-            RleSweepPolicy;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef AgentRlePolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RleSweepPolicy;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef AgentRlePolicy<
-                256,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RleSweepPolicy;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig&   device_rle_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        device_rle_config.template Init<PtxRleSweepPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            device_rle_config.template Init<typename Policy350::RleSweepPolicy>();
-        }
-        else if (ptx_version >= 300)
-        {
-            device_rle_config.template Init<typename Policy300::RleSweepPolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            device_rle_config.template Init<typename Policy200::RleSweepPolicy>();
-        }
-        else if (ptx_version >= 130)
-        {
-            device_rle_config.template Init<typename Policy130::RleSweepPolicy>();
-        }
-        else
-        {
-            device_rle_config.template Init<typename Policy100::RleSweepPolicy>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.  Mirrors the constants within AgentRlePolicyT.
-     */
-    struct KernelConfig
-    {
-        int                     block_threads;
-        int                     items_per_thread;
-        BlockLoadAlgorithm      load_policy;
-        bool                    store_warp_time_slicing;
-        BlockScanAlgorithm      scan_algorithm;
-
-        template <typename AgentRlePolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads               = AgentRlePolicyT::BLOCK_THREADS;
-            items_per_thread            = AgentRlePolicyT::ITEMS_PER_THREAD;
-            load_policy                 = AgentRlePolicyT::LOAD_ALGORITHM;
-            store_warp_time_slicing     = AgentRlePolicyT::STORE_WARP_TIME_SLICING;
-            scan_algorithm              = AgentRlePolicyT::SCAN_ALGORITHM;
-        }
-
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Print()
-        {
-            printf("%d, %d, %d, %d, %d",
-                block_threads,
-                items_per_thread,
-                load_policy,
-                store_warp_time_slicing,
-                scan_algorithm);
-        }
-    };
-
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-    /**
-     * Internal dispatch routine for computing a device-wide run-length-encode using the
-     * specified kernel functions.
-     */
-    template <
-        typename                    DeviceScanInitKernelPtr,        ///< Function type of cub::DeviceScanInitKernel
-        typename                    DeviceRleSweepKernelPtr>        ///< Function type of cub::DeviceRleSweepKernelPtr
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to the output sequence of run-offsets
-        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to the output sequence of run-lengths
-        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out)
-        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
-        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                         ptx_version,                    ///< [in] PTX version of dispatch kernels
-        DeviceScanInitKernelPtr     device_scan_init_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        DeviceRleSweepKernelPtr     device_rle_sweep_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
-        KernelConfig                device_rle_config)              ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[1];
-            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-
-            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
-            void* allocations[1];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Construct the tile status interface
-            ScanTileStateT tile_status;
-            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Log device_scan_init_kernel configuration
-            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
-            if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
-            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                tile_status,
-                num_tiles,
-                d_num_runs_out);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Return if empty problem
-            if (num_items == 0)
-                break;
-
-            // Get SM occupancy for device_rle_sweep_kernel
-            int device_rle_kernel_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                device_rle_kernel_sm_occupancy,            // out
-                device_rle_sweep_kernel,
-                device_rle_config.block_threads))) break;
-
-            // Get max x-dimension of grid
-            int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
-
-            // Get grid size for scanning tiles
-            dim3 scan_grid_size;
-            scan_grid_size.z = 1;
-            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
-            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
-
-            // Log device_rle_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);
-
-            // Invoke device_rle_sweep_kernel
-            device_rle_sweep_kernel<<<scan_grid_size, device_rle_config.block_threads, 0, stream>>>(
-                d_in,
-                d_offsets_out,
-                d_lengths_out,
-                d_num_runs_out,
-                tile_status,
-                equality_op,
-                num_items,
-                num_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to input sequence of data items
-        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets
-        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths
-        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
-        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
-        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig device_rle_config;
-            InitConfigs(ptx_version, device_rle_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_in,
-                d_offsets_out,
-                d_lengths_out,
-                d_num_runs_out,
-                equality_op,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
-                DeviceRleSweepKernel<PtxRleSweepPolicy, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, OffsetT>,
-                device_rle_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/dispatch/dispatch_scan.cuh b/external/cub/cub/device/dispatch/dispatch_scan.cuh
deleted file mode 100644
index 8944dcd33e0..00000000000
--- a/external/cub/cub/device/dispatch/dispatch_scan.cuh
+++ /dev/null
@@ -1,563 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../agent/agent_scan.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_arch.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Initialization kernel for tile status initialization (multi-block)
- */
-template <
-    typename            ScanTileStateT>     ///< Tile status interface type
-__global__ void DeviceScanInitKernel(
-    ScanTileStateT      tile_state,         ///< [in] Tile status interface
-    int                 num_tiles)          ///< [in] Number of tiles
-{
-    // Initialize tile status
-    tile_state.InitializeStatus(num_tiles);
-}
-
-/**
- * Initialization kernel for tile status initialization (multi-block)
- */
-template <
-    typename                ScanTileStateT,         ///< Tile status interface type
-    typename                NumSelectedIteratorT>   ///< Output iterator type for recording the number of items selected
-__global__ void DeviceCompactInitKernel(
-    ScanTileStateT          tile_state,             ///< [in] Tile status interface
-    int                     num_tiles,              ///< [in] Number of tiles
-    NumSelectedIteratorT    d_num_selected_out)     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
-{
-    // Initialize tile status
-    tile_state.InitializeStatus(num_tiles);
-
-    // Initialize d_num_selected_out
-    if ((blockIdx.x == 0) && (threadIdx.x == 0))
-        *d_num_selected_out = 0;
-}
-
-
-/**
- * Scan kernel entry point (multi-block)
- */
-template <
-    typename            ScanPolicyT,        ///< Parameterized ScanPolicyT tuning policy type
-    typename            InputIteratorT,     ///< Random-access input iterator type for reading scan inputs \iterator
-    typename            OutputIteratorT,    ///< Random-access output iterator type for writing scan outputs \iterator
-    typename            ScanTileStateT,     ///< Tile status interface type
-    typename            ScanOpT,            ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename            InitValueT,         ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans)
-    typename            OffsetT>            ///< Signed integer type for global offsets
-__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS))
-__global__ void DeviceScanKernel(
-    InputIteratorT      d_in,               ///< Input data
-    OutputIteratorT     d_out,              ///< Output data
-    ScanTileStateT      tile_state,         ///< Tile status interface
-    int                 start_tile,         ///< The starting tile for the current grid
-    ScanOpT             scan_op,            ///< Binary scan functor 
-    InitValueT          init_value,         ///< Initial value to seed the exclusive scan
-    OffsetT             num_items)          ///< Total number of scan items for the entire problem
-{
-    // Thread block type for scanning input tiles
-    typedef AgentScan<
-        ScanPolicyT,
-        InputIteratorT,
-        OutputIteratorT,
-        ScanOpT,
-        InitValueT,
-        OffsetT> AgentScanT;
-
-    // Shared memory for AgentScan
-    __shared__ typename AgentScanT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange(
-        num_items,
-        tile_state,
-        start_tile);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceScan
- */
-template <
-    typename InputIteratorT,     ///< Random-access input iterator type for reading scan inputs \iterator
-    typename OutputIteratorT,    ///< Random-access output iterator type for writing scan outputs \iterator
-    typename ScanOpT,            ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename InitValueT,          ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans)
-    typename OffsetT>            ///< Signed integer type for global offsets
-struct DispatchScan
-{
-    //---------------------------------------------------------------------
-    // Constants and Types
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        INIT_KERNEL_THREADS = 128
-    };
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<OutputT> ScanTileStateT;
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies
-    //---------------------------------------------------------------------
-
-    /// SM600
-    struct Policy600
-    {
-        typedef AgentScanPolicy<
-            CUB_NOMINAL_CONFIG(128, 15, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_STORE_TRANSPOSE,
-                BLOCK_SCAN_WARP_SCANS>
-            ScanPolicyT;
-    };
-
-
-    /// SM520
-    struct Policy520
-    {
-        // Titan X: 32.47B items/s @ 48M 32-bit T
-        typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(128, 12, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                BLOCK_SCAN_WARP_SCANS>
-            ScanPolicyT;
-    };
-
-
-    /// SM35
-    struct Policy350
-    {
-        // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
-        typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(128, 12, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
-                BLOCK_SCAN_RAKING>
-            ScanPolicyT;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(256, 9, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                BLOCK_SCAN_WARP_SCANS>
-            ScanPolicyT;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
-        typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(128, 12, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                BLOCK_SCAN_WARP_SCANS>
-            ScanPolicyT;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(96, 21, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            ScanPolicyT;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        typedef AgentScanPolicy<
-                CUB_NOMINAL_CONFIG(64, 9, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                BLOCK_SCAN_WARP_SCANS>
-            ScanPolicyT;
-    };
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies of current PTX compiler pass
-    //---------------------------------------------------------------------
-
-#if (CUB_PTX_ARCH >= 600)
-    typedef Policy600 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 520)
-    typedef Policy520 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {};
-
-
-    //---------------------------------------------------------------------
-    // Utilities
-    //---------------------------------------------------------------------
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &scan_kernel_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-        (void)ptx_version;
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        scan_kernel_config.template Init<PtxAgentScanPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 600)
-        {
-            scan_kernel_config.template Init<typename Policy600::ScanPolicyT>();
-        }
-        else if (ptx_version >= 520)
-        {
-            scan_kernel_config.template Init<typename Policy520::ScanPolicyT>();
-        }
-        else if (ptx_version >= 350)
-        {
-            scan_kernel_config.template Init<typename Policy350::ScanPolicyT>();
-        }
-        else if (ptx_version >= 300)
-        {
-            scan_kernel_config.template Init<typename Policy300::ScanPolicyT>();
-        }
-        else if (ptx_version >= 200)
-        {
-            scan_kernel_config.template Init<typename Policy200::ScanPolicyT>();
-        }
-        else if (ptx_version >= 130)
-        {
-            scan_kernel_config.template Init<typename Policy130::ScanPolicyT>();
-        }
-        else
-        {
-            scan_kernel_config.template Init<typename Policy100::ScanPolicyT>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.
-     */
-    struct KernelConfig
-    {
-        int block_threads;
-        int items_per_thread;
-        int tile_items;
-
-        template <typename PolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads       = PolicyT::BLOCK_THREADS;
-            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
-            tile_items          = block_threads * items_per_thread;
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Dispatch entrypoints
-    //---------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine for computing a device-wide prefix scan using the
-     * specified kernel functions.
-     */
-    template <
-        typename            ScanInitKernelPtrT,     ///< Function type of cub::DeviceScanInitKernel
-        typename            ScanSweepKernelPtrT>    ///< Function type of cub::DeviceScanKernelPtrT
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*               d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                   ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                  ///< [out] Pointer to the output sequence of data items
-        ScanOpT             scan_op,                ///< [in] Binary scan functor 
-        InitValueT          init_value,             ///< [in] Initial value to seed the exclusive scan
-        OffsetT             num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t        stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous,      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                 /*ptx_version*/,        ///< [in] PTX version of dispatch kernels
-        ScanInitKernelPtrT  init_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        ScanSweepKernelPtrT scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel
-        KernelConfig        scan_kernel_config)     ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-        (void)d_temp_storage;
-        (void)temp_storage_bytes;
-        (void)d_in;
-        (void)d_out;
-        (void)scan_op;
-        (void)init_value;
-        (void)num_items;
-        (void)stream;
-        (void)debug_synchronous;
-        (void)init_kernel;
-        (void)scan_kernel;
-        (void)scan_kernel_config;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[1];
-            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-
-            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
-            void* allocations[1];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Return if empty problem
-            if (num_items == 0)
-                break;
-
-            // Construct the tile status interface
-            ScanTileStateT tile_state;
-            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Log init_kernel configuration
-            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
-            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke init_kernel to initialize tile descriptors
-            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                tile_state,
-                num_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Get SM occupancy for scan_kernel
-            int scan_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                scan_sm_occupancy,            // out
-                scan_kernel,
-                scan_kernel_config.block_threads))) break;
-
-            // Get max x-dimension of grid
-            int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
-
-            // Run grids in epochs (in case number of tiles exceeds max x-dimension
-            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
-            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
-            {
-                // Log scan_kernel configuration
-                if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                    start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy);
-
-                // Invoke scan_kernel
-                scan_kernel<<<scan_grid_size, scan_kernel_config.block_threads, 0, stream>>>(
-                    d_in,
-                    d_out,
-                    tile_state,
-                    start_tile,
-                    scan_op,
-                    init_value,
-                    num_items);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*           d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&         temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                  ///< [out] Pointer to the output sequence of data items
-        ScanOpT         scan_op,                ///< [in] Binary scan functor 
-        InitValueT      init_value,             ///< [in] Initial value to seed the exclusive scan
-        OffsetT         num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig scan_kernel_config;
-            InitConfigs(ptx_version, scan_kernel_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_in,
-                d_out,
-                scan_op,
-                init_value,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceScanInitKernel<ScanTileStateT>,
-                DeviceScanKernel<PtxAgentScanPolicy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>,
-                scan_kernel_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/dispatch/dispatch_select_if.cuh b/external/cub/cub/device/dispatch/dispatch_select_if.cuh
deleted file mode 100644
index 6f033197c2d..00000000000
--- a/external/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ /dev/null
@@ -1,542 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch_scan.cuh"
-#include "../../agent/agent_select_if.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Select kernel entry point (multi-block)
- *
- * Performs functor-based selection if SelectOpT functor type != NullType
- * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
- * Otherwise performs discontinuity selection (keep unique)
- */
-template <
-    typename            AgentSelectIfPolicyT,       ///< Parameterized AgentSelectIfPolicyT tuning policy type
-    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items
-    typename            FlagsInputIteratorT,        ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename            SelectedOutputIteratorT,    ///< Random-access output iterator type for writing selected items
-    typename            NumSelectedIteratorT,       ///< Output iterator type for recording the number of items selected
-    typename            ScanTileStateT,             ///< Tile status interface type
-    typename            SelectOpT,                  ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
-    typename            EqualityOpT,                ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
-    typename            OffsetT,                    ///< Signed integer type for global offsets
-    bool                KEEP_REJECTS>               ///< Whether or not we push rejected items to the back of the output
-__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS))
-__global__ void DeviceSelectSweepKernel(
-    InputIteratorT          d_in,                   ///< [in] Pointer to the input sequence of data items
-    FlagsInputIteratorT     d_flags,                ///< [in] Pointer to the input sequence of selection flags (if applicable)
-    SelectedOutputIteratorT d_selected_out,         ///< [out] Pointer to the output sequence of selected data items
-    NumSelectedIteratorT    d_num_selected_out,     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
-    ScanTileStateT          tile_status,            ///< [in] Tile status interface
-    SelectOpT               select_op,              ///< [in] Selection operator
-    EqualityOpT             equality_op,            ///< [in] Equality operator
-    OffsetT                 num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
-    int                     num_tiles)              ///< [in] Total number of tiles for the entire problem
-{
-    // Thread block type for selecting data from input tiles
-    typedef AgentSelectIf<
-        AgentSelectIfPolicyT,
-        InputIteratorT,
-        FlagsInputIteratorT,
-        SelectedOutputIteratorT,
-        SelectOpT,
-        EqualityOpT,
-        OffsetT,
-        KEEP_REJECTS> AgentSelectIfT;
-
-    // Shared memory for AgentSelectIf
-    __shared__ typename AgentSelectIfT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange(
-        num_tiles,
-        tile_status,
-        d_num_selected_out);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
- */
-template <
-    typename    InputIteratorT,                 ///< Random-access input iterator type for reading input items
-    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename    SelectedOutputIteratorT,        ///< Random-access output iterator type for writing selected items
-    typename    NumSelectedIteratorT,           ///< Output iterator type for recording the number of items selected
-    typename    SelectOpT,                      ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
-    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
-    typename    OffsetT,                        ///< Signed integer type for global offsets
-    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
-struct DispatchSelectIf
-{
-    /******************************************************************************
-     * Types and constants
-     ******************************************************************************/
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
-        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // The flag value type
-    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
-
-    enum
-    {
-        INIT_KERNEL_THREADS = 128,
-    };
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<OffsetT> ScanTileStateT;
-
-
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 10,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
-        };
-
-        typedef AgentSelectIfPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                BLOCK_SCAN_WARP_SCANS>
-            SelectIfPolicyT;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 7,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
-        };
-
-        typedef AgentSelectIfPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            SelectIfPolicyT;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
-        };
-
-        typedef AgentSelectIfPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            SelectIfPolicyT;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
-        };
-
-        typedef AgentSelectIfPolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            SelectIfPolicyT;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
-        };
-
-        typedef AgentSelectIfPolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_RAKING>
-            SelectIfPolicyT;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &select_if_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-        (void)ptx_version;
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        select_if_config.template Init<PtxSelectIfPolicyT>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            select_if_config.template Init<typename Policy350::SelectIfPolicyT>();
-        }
-        else if (ptx_version >= 300)
-        {
-            select_if_config.template Init<typename Policy300::SelectIfPolicyT>();
-        }
-        else if (ptx_version >= 200)
-        {
-            select_if_config.template Init<typename Policy200::SelectIfPolicyT>();
-        }
-        else if (ptx_version >= 130)
-        {
-            select_if_config.template Init<typename Policy130::SelectIfPolicyT>();
-        }
-        else
-        {
-            select_if_config.template Init<typename Policy100::SelectIfPolicyT>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.
-     */
-    struct KernelConfig
-    {
-        int block_threads;
-        int items_per_thread;
-        int tile_items;
-
-        template <typename PolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads       = PolicyT::BLOCK_THREADS;
-            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
-            tile_items          = block_threads * items_per_thread;
-        }
-    };
-
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-    /**
-     * Internal dispatch routine for computing a device-wide selection using the
-     * specified kernel functions.
-     */
-    template <
-        typename                    ScanInitKernelPtrT,             ///< Function type of cub::DeviceScanInitKernel
-        typename                    SelectIfKernelPtrT>             ///< Function type of cub::SelectIfKernelPtrT
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
-        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
-        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
-        SelectOpT                   select_op,                      ///< [in] Selection operator
-        EqualityOpT                 equality_op,                    ///< [in] Equality operator
-        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                         /*ptx_version*/,                ///< [in] PTX version of dispatch kernels
-        ScanInitKernelPtrT          scan_init_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        SelectIfKernelPtrT          select_if_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
-        KernelConfig                select_if_config)               ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-        (void)d_temp_storage;
-        (void)temp_storage_bytes;
-        (void)d_in;
-        (void)d_flags;
-        (void)d_selected_out;
-        (void)d_num_selected_out;
-        (void)select_op;
-        (void)equality_op;
-        (void)num_items;
-        (void)stream;
-        (void)debug_synchronous;
-        (void)scan_init_kernel;
-        (void)select_if_kernel;
-        (void)select_if_config;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[1];
-            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-
-            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
-            void* allocations[1];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Construct the tile status interface
-            ScanTileStateT tile_status;
-            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Log scan_init_kernel configuration
-            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
-            if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke scan_init_kernel to initialize tile descriptors
-            scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                tile_status,
-                num_tiles,
-                d_num_selected_out);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Return if empty problem
-            if (num_items == 0)
-                break;
-
-            // Get SM occupancy for select_if_kernel
-            int range_select_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                range_select_sm_occupancy,            // out
-                select_if_kernel,
-                select_if_config.block_threads))) break;
-
-            // Get max x-dimension of grid
-            int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
-
-            // Get grid size for scanning tiles
-            dim3 scan_grid_size;
-            scan_grid_size.z = 1;
-            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
-            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
-
-            // Log select_if_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);
-
-            // Invoke select_if_kernel
-            select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
-                d_in,
-                d_flags,
-                d_selected_out,
-                d_num_selected_out,
-                tile_status,
-                select_op,
-                equality_op,
-                num_items,
-                num_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
-        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
-        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
-        SelectOpT                   select_op,                      ///< [in] Selection operator
-        EqualityOpT                 equality_op,                    ///< [in] Equality operator
-        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig select_if_config;
-            InitConfigs(ptx_version, select_if_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_in,
-                d_flags,
-                d_selected_out,
-                d_num_selected_out,
-                select_op,
-                equality_op,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
-                DeviceSelectSweepKernel<PtxSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS>,
-                select_if_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/external/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
deleted file mode 100644
index 3417913c7d8..00000000000
--- a/external/cub/cub/device/dispatch/dispatch_spmv_orig.cuh
+++ /dev/null
@@ -1,834 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../agent/single_pass_scan_operators.cuh"
-#include "../../agent/agent_segment_fixup.cuh"
-#include "../../agent/agent_spmv_orig.cuh"
-#include "../../util_type.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../thread/thread_search.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * SpMV kernel entry points
- *****************************************************************************/
-
-/**
- * Spmv search kernel. Identifies merge path starting coordinates for each tile.
- */
-template <
-    typename    AgentSpmvPolicyT,           ///< Parameterized SpmvPolicy tuning policy type
-    typename    ValueT,                     ///< Matrix and vector value type
-    typename    OffsetT>                    ///< Signed integer type for sequence offsets
-__global__ void DeviceSpmv1ColKernel(
-    SpmvParams<ValueT, OffsetT> spmv_params)                ///< [in] SpMV input parameter bundle
-{
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
-            ValueT,
-            OffsetT>
-        VectorValueIteratorT;
-
-    VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x);
-
-    int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (row_idx < spmv_params.num_rows)
-    {
-        OffsetT     end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx];
-        OffsetT     nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1];
-
-        ValueT value = 0.0;
-        if (end_nonzero_idx != nonzero_idx)
-        {
-            value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]];
-        }
-
-        spmv_params.d_vector_y[row_idx] = value;
-    }
-}
-
-
-/**
- * Spmv search kernel. Identifies merge path starting coordinates for each tile.
- */
-template <
-    typename    SpmvPolicyT,                    ///< Parameterized SpmvPolicy tuning policy type
-    typename    OffsetT,                        ///< Signed integer type for sequence offsets
-    typename    CoordinateT,                    ///< Merge path coordinate type
-    typename    SpmvParamsT>                    ///< SpmvParams type
-__global__ void DeviceSpmvSearchKernel(
-    int             num_merge_tiles,            ///< [in] Number of SpMV merge tiles (spmv grid size)
-    CoordinateT*    d_tile_coordinates,         ///< [out] Pointer to the temporary array of tile starting coordinates
-    SpmvParamsT     spmv_params)                ///< [in] SpMV input parameter bundle
-{
-    /// Constants
-    enum
-    {
-        BLOCK_THREADS           = SpmvPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = SpmvPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-    typedef CacheModifiedInputIterator<
-            SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
-            OffsetT,
-            OffsetT>
-        RowOffsetsSearchIteratorT;
-
-    // Find the starting coordinate for all tiles (plus the end coordinate of the last one)
-    int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (tile_idx < num_merge_tiles + 1)
-    {
-        OffsetT                         diagonal = (tile_idx * TILE_ITEMS);
-        CoordinateT                     tile_coordinate;
-        CountingInputIterator<OffsetT>  nonzero_indices(0);
-
-        // Search the merge path
-        MergePathSearch(
-            diagonal,
-            RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
-            nonzero_indices,
-            spmv_params.num_rows,
-            spmv_params.num_nonzeros,
-            tile_coordinate);
-
-        // Output starting offset
-        d_tile_coordinates[tile_idx] = tile_coordinate;
-    }
-}
-
-
-/**
- * Spmv agent entry point
- */
-template <
-    typename        SpmvPolicyT,                ///< Parameterized SpmvPolicy tuning policy type
-    typename        ScanTileStateT,             ///< Tile status interface type
-    typename        ValueT,                     ///< Matrix and vector value type
-    typename        OffsetT,                    ///< Signed integer type for sequence offsets
-    typename        CoordinateT,                ///< Merge path coordinate type
-    bool            HAS_ALPHA,                  ///< Whether the input parameter Alpha is 1
-    bool            HAS_BETA>                   ///< Whether the input parameter Beta is 0
-__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS))
-__global__ void DeviceSpmvKernel(
-    SpmvParams<ValueT, OffsetT>     spmv_params,                ///< [in] SpMV input parameter bundle
-    CoordinateT*                    d_tile_coordinates,         ///< [in] Pointer to the temporary array of tile starting coordinates
-    KeyValuePair<OffsetT,ValueT>*   d_tile_carry_pairs,         ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
-    int                             num_tiles,                  ///< [in] Number of merge tiles
-    ScanTileStateT                  tile_state,                 ///< [in] Tile status interface for fixup reduce-by-key kernel
-    int                             num_segment_fixup_tiles)    ///< [in] Number of reduce-by-key tiles (fixup grid size)
-{
-    // Spmv agent type specialization
-    typedef AgentSpmv<
-            SpmvPolicyT,
-            ValueT,
-            OffsetT,
-            HAS_ALPHA,
-            HAS_BETA>
-        AgentSpmvT;
-
-    // Shared memory for AgentSpmv
-    __shared__ typename AgentSpmvT::TempStorage temp_storage;
-
-    AgentSpmvT(temp_storage, spmv_params).ConsumeTile(
-        d_tile_coordinates,
-        d_tile_carry_pairs,
-        num_tiles);
-
-    // Initialize fixup tile status
-    tile_state.InitializeStatus(num_segment_fixup_tiles);
-
-}
-
-
-/**
- * Multi-block reduce-by-key sweep kernel entry point
- */
-template <
-    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
-    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
-    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
-    typename    OffsetT,                        ///< Signed integer type for global offsets
-    typename    ScanTileStateT>                 ///< Tile status interface type
-__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
-__global__ void DeviceSegmentFixupKernel(
-    PairsInputIteratorT         d_pairs_in,         ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
-    AggregatesOutputIteratorT   d_aggregates_out,   ///< [in,out] Output value aggregates
-    OffsetT                     num_items,          ///< [in] Total number of items to select from
-    int                         num_tiles,          ///< [in] Total number of tiles for the entire problem
-    ScanTileStateT              tile_state)         ///< [in] Tile status interface
-{
-    // Thread block type for reducing tiles of value segments
-    typedef AgentSegmentFixup<
-            AgentSegmentFixupPolicyT,
-            PairsInputIteratorT,
-            AggregatesOutputIteratorT,
-            cub::Equality,
-            cub::Sum,
-            OffsetT>
-        AgentSegmentFixupT;
-
-    // Shared memory for AgentSegmentFixup
-    __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange(
-        num_items,
-        num_tiles,
-        tile_state);
-}
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv
- */
-template <
-    typename    ValueT,                     ///< Matrix and vector value type
-    typename    OffsetT>                    ///< Signed integer type for global offsets
-struct DispatchSpmv
-{
-    //---------------------------------------------------------------------
-    // Constants and Types
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        INIT_KERNEL_THREADS = 128
-    };
-
-    // SpmvParams bundle type
-    typedef SpmvParams<ValueT, OffsetT> SpmvParamsT;
-
-    // 2D merge path coordinate type
-    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
-
-    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
-    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies
-    //---------------------------------------------------------------------
-
-    /// SM11
-    struct Policy110
-    {
-        typedef AgentSpmvPolicy<
-                128,
-                1,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_WARP_SCANS>
-            SpmvPolicyT;
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                4,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-    };
-
-    /// SM20
-    struct Policy200 
-    {
-        typedef AgentSpmvPolicy<
-                96,
-                18,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_RAKING>
-            SpmvPolicyT;
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                4,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-
-    };
-
-
-
-    /// SM30
-    struct Policy300 
-    {
-        typedef AgentSpmvPolicy<
-                96,
-                6,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_WARP_SCANS>
-            SpmvPolicyT;
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                4,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-
-    };
-
-
-    /// SM35
-    struct Policy350
-    {
-        typedef AgentSpmvPolicy<
-                (sizeof(ValueT) > 4) ? 96 : 128,
-                (sizeof(ValueT) > 4) ? 4 : 7,
-                LOAD_LDG,
-                LOAD_CA,
-                LOAD_LDG,
-                LOAD_LDG,
-                LOAD_LDG,
-                (sizeof(ValueT) > 4) ? true : false,
-                BLOCK_SCAN_WARP_SCANS>
-            SpmvPolicyT;
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                3,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_LDG,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-    };
-
-
-    /// SM37
-    struct Policy370
-    {
-
-        typedef AgentSpmvPolicy<
-                (sizeof(ValueT) > 4) ? 128 : 128,
-                (sizeof(ValueT) > 4) ? 9 : 14,
-                LOAD_LDG,
-                LOAD_CA,
-                LOAD_LDG,
-                LOAD_LDG,
-                LOAD_LDG,
-                false, 
-                BLOCK_SCAN_WARP_SCANS>
-            SpmvPolicyT;
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                3,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_LDG,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-    };
-
-    /// SM50
-    struct Policy500
-    {
-        typedef AgentSpmvPolicy<
-                (sizeof(ValueT) > 4) ? 64 : 128,
-                (sizeof(ValueT) > 4) ? 6 : 7,
-                LOAD_LDG,
-                LOAD_DEFAULT,
-                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
-                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
-                LOAD_LDG,
-                (sizeof(ValueT) > 4) ? true : false,
-                (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>
-            SpmvPolicyT;
-
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                3,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_LDG,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            SegmentFixupPolicyT;
-    };
-
-
-    /// SM60
-    struct Policy600
-    {
-        typedef AgentSpmvPolicy<
-                (sizeof(ValueT) > 4) ? 64 : 128,
-                (sizeof(ValueT) > 4) ? 5 : 7,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_WARP_SCANS>
-            SpmvPolicyT;
-
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                3,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-    };
-
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies of current PTX compiler pass
-    //---------------------------------------------------------------------
-
-#if (CUB_PTX_ARCH >= 600)
-    typedef Policy600 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 500)
-    typedef Policy500 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 370)
-    typedef Policy370 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#else
-    typedef Policy110 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {};
-    struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {};
-
-
-    //---------------------------------------------------------------------
-    // Utilities
-    //---------------------------------------------------------------------
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &spmv_config,
-        KernelConfig    &segment_fixup_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        spmv_config.template Init<PtxSpmvPolicyT>();
-        segment_fixup_config.template Init<PtxSegmentFixupPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 600)
-        {
-            spmv_config.template            Init<typename Policy600::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy600::SegmentFixupPolicyT>();
-        }
-        else if (ptx_version >= 500)
-        {
-            spmv_config.template            Init<typename Policy500::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy500::SegmentFixupPolicyT>();
-        }
-        else if (ptx_version >= 370)
-        {
-            spmv_config.template            Init<typename Policy370::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy370::SegmentFixupPolicyT>();
-        }
-        else if (ptx_version >= 350)
-        {
-            spmv_config.template            Init<typename Policy350::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy350::SegmentFixupPolicyT>();
-        }
-        else if (ptx_version >= 300)
-        {
-            spmv_config.template            Init<typename Policy300::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy300::SegmentFixupPolicyT>();
-
-        }
-        else if (ptx_version >= 200)
-        {
-            spmv_config.template            Init<typename Policy200::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy200::SegmentFixupPolicyT>();
-        }
-        else
-        {
-            spmv_config.template            Init<typename Policy110::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy110::SegmentFixupPolicyT>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.
-     */
-    struct KernelConfig
-    {
-        int block_threads;
-        int items_per_thread;
-        int tile_items;
-
-        template <typename PolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads       = PolicyT::BLOCK_THREADS;
-            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
-            tile_items          = block_threads * items_per_thread;
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Dispatch entrypoints
-    //---------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduction using the
-     * specified kernel functions.
-     *
-     * If the input is larger than a single tile, this method uses two-passes of
-     * kernel invocations.
-     */
-    template <
-        typename                Spmv1ColKernelT,                    ///< Function type of cub::DeviceSpmv1ColKernel
-        typename                SpmvSearchKernelT,                  ///< Function type of cub::AgentSpmvSearchKernel
-        typename                SpmvKernelT,                        ///< Function type of cub::AgentSpmvKernel
-        typename                SegmentFixupKernelT>                 ///< Function type of cub::DeviceSegmentFixupKernelT
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
-        cudaStream_t            stream,                             ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous,                  ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        Spmv1ColKernelT         spmv_1col_kernel,                   ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
-        SpmvSearchKernelT       spmv_search_kernel,                 ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
-        SpmvKernelT             spmv_kernel,                        ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
-        SegmentFixupKernelT     segment_fixup_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
-        KernelConfig            spmv_config,                        ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
-        KernelConfig            segment_fixup_config)               ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for
-    {
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            if (spmv_params.num_cols == 1)
-            {
-                if (d_temp_storage == NULL)
-                {
-                    // Return if the caller is simply requesting the size of the storage allocation
-                    temp_storage_bytes = 1;
-                    break;
-                }
-
-                // Get search/init grid dims
-                int degen_col_kernel_block_size     = INIT_KERNEL_THREADS;
-                int degen_col_kernel_grid_size      = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;
-
-                if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
-                    degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);
-
-                // Invoke spmv_search_kernel
-                spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
-                    spmv_params);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-                break;
-            }
-
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Get max x-dimension of grid
-            int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
-
-            // Total number of spmv work items
-            int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros;
-
-            // Tile sizes of kernels
-            int merge_tile_size              = spmv_config.block_threads * spmv_config.items_per_thread;
-            int segment_fixup_tile_size     = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;
-
-            // Number of tiles for kernels
-            unsigned int num_merge_tiles            = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
-            unsigned int num_segment_fixup_tiles    = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;
-
-            // Get SM occupancy for kernels
-            int spmv_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                spmv_sm_occupancy,
-                spmv_kernel,
-                spmv_config.block_threads))) break;
-
-            int segment_fixup_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                segment_fixup_sm_occupancy,
-                segment_fixup_kernel,
-                segment_fixup_config.block_threads))) break;
-
-            // Get grid dimensions
-            dim3 spmv_grid_size(
-                CUB_MIN(num_merge_tiles, max_dim_x),
-                (num_merge_tiles + max_dim_x - 1) / max_dim_x,
-                1);
-
-            dim3 segment_fixup_grid_size(
-                CUB_MIN(num_segment_fixup_tiles, max_dim_x),
-                (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x,
-                1);
-
-            // Get the temporary storage allocation requirements
-            size_t allocation_sizes[3];
-            if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break;    // bytes needed for reduce-by-key tile status descriptors
-            allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT);       // bytes needed for block carry-out pairs
-            allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT);   // bytes needed for tile starting coordinates
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            void* allocations[3];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Construct the tile status interface
-            ScanTileStateT tile_state;
-            if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Alias the other allocations
-            KeyValuePairT*  d_tile_carry_pairs      = (KeyValuePairT*) allocations[1];  // Agent carry-out pairs
-            CoordinateT*    d_tile_coordinates      = (CoordinateT*) allocations[2];    // Agent starting coordinates
-
-            // Get search/init grid dims
-            int search_block_size   = INIT_KERNEL_THREADS;
-            int search_grid_size    = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;
-
-#if (CUB_PTX_ARCH == 0)
-            // Init textures
-            if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
-#endif
-
-            if (search_grid_size < sm_count)
-//            if (num_merge_tiles < spmv_sm_occupancy * sm_count)
-            {
-                // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords
-                d_tile_coordinates = NULL;
-            }
-            else
-            {
-                // Use separate search kernel if we have enough spmv tiles to saturate the device
-
-                // Log spmv_search_kernel configuration
-                if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
-                    search_grid_size, search_block_size, (long long) stream);
-
-                // Invoke spmv_search_kernel
-                spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
-                    num_merge_tiles,
-                    d_tile_coordinates,
-                    spmv_params);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-
-            // Log spmv_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);
-
-            // Invoke spmv_kernel
-            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
-                spmv_params,
-                d_tile_coordinates,
-                d_tile_carry_pairs,
-                num_merge_tiles,
-                tile_state,
-                num_segment_fixup_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Run reduce-by-key fixup if necessary
-            if (num_merge_tiles > 1)
-            {
-                // Log segment_fixup_kernel configuration
-                if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                    segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);
-
-                // Invoke segment_fixup_kernel
-                segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
-                    d_tile_carry_pairs,
-                    spmv_params.d_vector_y,
-                    num_merge_tiles,
-                    num_segment_fixup_tiles,
-                    tile_state);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-
-#if (CUB_PTX_ARCH == 0)
-            // Free textures
-            if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
-#endif
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduction
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
-        cudaStream_t            stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig spmv_config, segment_fixup_config;
-            InitConfigs(ptx_version, spmv_config, segment_fixup_config);
-
-            if (CubDebug(error = Dispatch(
-                d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
-                DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
-                DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
-                DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
-                DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
-                spmv_config, segment_fixup_config))) break;
-
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/grid/grid_barrier.cuh b/external/cub/cub/grid/grid_barrier.cuh
deleted file mode 100644
index d9f83360b9e..00000000000
--- a/external/cub/cub/grid/grid_barrier.cuh
+++ /dev/null
@@ -1,211 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
- */
-
-#pragma once
-
-#include "../util_debug.cuh"
-#include "../util_namespace.cuh"
-#include "../thread/thread_load.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/**
- * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
- */
-class GridBarrier
-{
-protected :
-
-    typedef unsigned int SyncFlag;
-
-    // Counters in global device memory
-    SyncFlag* d_sync;
-
-public:
-
-    /**
-     * Constructor
-     */
-    GridBarrier() : d_sync(NULL) {}
-
-
-    /**
-     * Synchronize
-     */
-    __device__ __forceinline__ void Sync() const
-    {
-        volatile SyncFlag *d_vol_sync = d_sync;
-
-        // Threadfence and syncthreads to make sure global writes are visible before
-        // thread-0 reports in with its sync counter
-        __threadfence();
-        CTA_SYNC();
-
-        if (blockIdx.x == 0)
-        {
-            // Report in ourselves
-            if (threadIdx.x == 0)
-            {
-                d_vol_sync[blockIdx.x] = 1;
-            }
-
-            CTA_SYNC();
-
-            // Wait for everyone else to report in
-            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
-            {
-                while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
-                {
-                    __threadfence_block();
-                }
-            }
-
-            CTA_SYNC();
-
-            // Let everyone know it's safe to proceed
-            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
-            {
-                d_vol_sync[peer_block] = 0;
-            }
-        }
-        else
-        {
-            if (threadIdx.x == 0)
-            {
-                // Report in
-                d_vol_sync[blockIdx.x] = 1;
-
-                // Wait for acknowledgment
-                while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
-                {
-                    __threadfence_block();
-                }
-            }
-
-            CTA_SYNC();
-        }
-    }
-};
-
-
-/**
- * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
- *
- * Uses RAII for lifetime, i.e., device resources are reclaimed when
- * the destructor is called.
- */
-class GridBarrierLifetime : public GridBarrier
-{
-protected:
-
-    // Number of bytes backed by d_sync
-    size_t sync_bytes;
-
-public:
-
-    /**
-     * Constructor
-     */
-    GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
-
-
-    /**
-     * DeviceFrees and resets the progress counters
-     */
-    cudaError_t HostReset()
-    {
-        cudaError_t retval = cudaSuccess;
-        if (d_sync)
-        {
-            CubDebug(retval = cudaFree(d_sync));
-            d_sync = NULL;
-        }
-        sync_bytes = 0;
-        return retval;
-    }
-
-
-    /**
-     * Destructor
-     */
-    virtual ~GridBarrierLifetime()
-    {
-        HostReset();
-    }
-
-
-    /**
-     * Sets up the progress counters for the next kernel launch (lazily
-     * allocating and initializing them if necessary)
-     */
-    cudaError_t Setup(int sweep_grid_size)
-    {
-        cudaError_t retval = cudaSuccess;
-        do {
-            size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
-            if (new_sync_bytes > sync_bytes)
-            {
-                if (d_sync)
-                {
-                    if (CubDebug(retval = cudaFree(d_sync))) break;
-                }
-
-                sync_bytes = new_sync_bytes;
-
-                // Allocate and initialize to zero
-                if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
-                if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
-            }
-        } while (0);
-
-        return retval;
-    }
-};
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/grid/grid_even_share.cuh b/external/cub/cub/grid/grid_even_share.cuh
deleted file mode 100644
index 3ba29da7ae6..00000000000
--- a/external/cub/cub/grid/grid_even_share.cuh
+++ /dev/null
@@ -1,222 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly the same number of fixed-size work units (grains).
- */
-
-
-#pragma once
-
-#include "../util_namespace.cuh"
-#include "../util_macro.cuh"
-#include "grid_mapping.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/**
- * \brief GridEvenShare is a descriptor utility for distributing input among
- * CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly
- * the same number of input tiles.
- *
- * \par Overview
- * Each thread block is assigned a consecutive sequence of input tiles.  To help
- * preserve alignment and eliminate the overhead of guarded loads for all but the
- * last thread block, to GridEvenShare assigns one of three different amounts of
- * work to a given thread block: "big", "normal", or "last".  The "big" workloads
- * are one scheduling grain larger than "normal".  The "last" work unit for the
- * last thread block may be partially-full if the input is not an even multiple of
- * the scheduling grain size.
- *
- * \par
- * Before invoking a child grid, a parent thread will typically construct an
- * instance of GridEvenShare.  The instance can be passed to child thread blocks
- * which can initialize their per-thread block offsets using \p BlockInit().
- */
-template <typename OffsetT>
-struct GridEvenShare
-{
-private:
-
-    OffsetT     total_tiles;
-    int         big_shares;
-    OffsetT     big_share_items;
-    OffsetT     normal_share_items;
-    OffsetT     normal_base_offset;
-
-public:
-
-    /// Total number of input items
-    OffsetT     num_items;
-
-    /// Grid size in thread blocks
-    int         grid_size;
-
-    /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
-    OffsetT     block_offset;
-
-    /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles
-    OffsetT     block_end;
-
-    /// Stride between input tiles
-    OffsetT     block_stride;
-
-
-    /**
-     * \brief Constructor.
-     */
-    __host__ __device__ __forceinline__ GridEvenShare() :
-        total_tiles(0),
-        big_shares(0),
-        big_share_items(0),
-        normal_share_items(0),
-        normal_base_offset(0),
-        num_items(0),
-        grid_size(0),
-        block_offset(0),
-        block_end(0),
-        block_stride(0)
-    {}
-
-
-    /**
-     * \brief Dispatch initializer. To be called prior prior to kernel launch.
-     */
-    __host__ __device__ __forceinline__ void DispatchInit(
-        OffsetT num_items,          ///< Total number of input items
-        int     max_grid_size,      ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items)
-        int     tile_items)         ///< Number of data items per input tile
-    {
-        this->block_offset          = num_items;    // Initialize past-the-end
-        this->block_end             = num_items;    // Initialize past-the-end
-        this->num_items             = num_items;
-        this->total_tiles           = (num_items + tile_items - 1) / tile_items;
-        this->grid_size             = CUB_MIN(total_tiles, max_grid_size);
-        OffsetT avg_tiles_per_block = total_tiles / grid_size;
-        this->big_shares            = total_tiles - (avg_tiles_per_block * grid_size);        // leftover grains go to big blocks
-        this->normal_share_items    = avg_tiles_per_block * tile_items;
-        this->normal_base_offset    = big_shares * tile_items;
-        this->big_share_items       = normal_share_items + tile_items;
-    }
-
-
-    /**
-     * \brief Initializes ranges for the specified thread block index.  Specialized
-     * for a "raking" access pattern in which each thread block is assigned a
-     * consecutive sequence of input tiles.
-     */
-    template <int TILE_ITEMS>
-    __device__ __forceinline__ void BlockInit(
-        int block_id,
-        Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
-    {
-        block_stride = TILE_ITEMS;
-        if (block_id < big_shares)
-        {
-            // This thread block gets a big share of grains (avg_tiles_per_block + 1)
-            block_offset = (block_id * big_share_items);
-            block_end = block_offset + big_share_items;
-        }
-        else if (block_id < total_tiles)
-        {
-            // This thread block gets a normal share of grains (avg_tiles_per_block)
-            block_offset = normal_base_offset + (block_id * normal_share_items);
-            block_end = CUB_MIN(num_items, block_offset + normal_share_items);
-        }
-        // Else default past-the-end
-    }
-
-
-    /**
-     * \brief Block-initialization, specialized for a "raking" access
-     * pattern in which each thread block is assigned a consecutive sequence
-     * of input tiles.
-     */
-    template <int TILE_ITEMS>
-    __device__ __forceinline__ void BlockInit(
-        int block_id,
-        Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
-    {
-        block_stride = grid_size * TILE_ITEMS;
-        block_offset = (block_id * TILE_ITEMS);
-        block_end = num_items;
-    }
-
-
-    /**
-     * \brief Block-initialization, specialized for "strip mining" access
-     * pattern in which the input tiles assigned to each thread block are
-     * separated by a stride equal to the the extent of the grid.
-     */
-    template <
-        int TILE_ITEMS,
-        GridMappingStrategy STRATEGY>
-    __device__ __forceinline__ void BlockInit()
-    {
-        BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
-    }
-
-
-    /**
-     * \brief Block-initialization, specialized for a "raking" access
-     * pattern in which each thread block is assigned a consecutive sequence
-     * of input tiles.
-     */
-    template <int TILE_ITEMS>
-    __device__ __forceinline__ void BlockInit(
-        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
-    {
-        this->block_offset = block_offset;
-        this->block_end = block_end;
-        this->block_stride = TILE_ITEMS;
-    }
-
-
-};
-
-
-
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/grid/grid_mapping.cuh b/external/cub/cub/grid/grid_mapping.cuh
deleted file mode 100644
index 6cd89209f83..00000000000
--- a/external/cub/cub/grid/grid_mapping.cuh
+++ /dev/null
@@ -1,113 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
- */
-
-#pragma once
-
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/******************************************************************************
- * Mapping policies
- *****************************************************************************/
-
-
-/**
- * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
- */
-enum GridMappingStrategy
-{
-    /**
-     * \brief An a "raking" access pattern in which each thread block is
-     * assigned a consecutive sequence of input tiles
-     *
-     * \par Overview
-     * The input is evenly partitioned into \p p segments, where \p p is
-     * constant and corresponds loosely to the number of thread blocks that may
-     * actively reside on the target device. Each segment is comprised of
-     * consecutive tiles, where a tile is a small, constant-sized unit of input
-     * to be processed to completion before the thread block terminates or
-     * obtains more work.  The kernel invokes \p p thread blocks, each
-     * of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
-     * in tile-size increments.
-     */
-    GRID_MAPPING_RAKE,
-
-    /**
-     * \brief An a "strip mining" access pattern in which the input tiles assigned
-     * to each thread block are separated by a stride equal to the the extent of
-     * the grid.
-     *
-     * \par Overview
-     * The input is evenly partitioned into \p p sets, where \p p is
-     * constant and corresponds loosely to the number of thread blocks that may
-     * actively reside on the target device. Each set is comprised of
-     * data tiles separated by stride \p tiles, where a tile is a small,
-     * constant-sized unit of input to be processed to completion before the
-     * thread block terminates or obtains more work.  The kernel invokes \p p
-     * thread blocks, each of which iteratively consumes a segment of
-     * <em>n</em>/<em>p</em> elements in tile-size increments.
-     */
-    GRID_MAPPING_STRIP_MINE,
-
-    /**
-     * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
-     *
-     * \par Overview
-     * The input is treated as a queue to be dynamically consumed by a grid of
-     * thread blocks.  Work is atomically dequeued in tiles, where a tile is a
-     * unit of input to be processed to completion before the thread block
-     * terminates or obtains more work.  The grid size \p p is constant,
-     * loosely corresponding to the number of thread blocks that may actively
-     * reside on the target device.
-     */
-    GRID_MAPPING_DYNAMIC,
-};
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/grid/grid_queue.cuh b/external/cub/cub/grid/grid_queue.cuh
deleted file mode 100644
index f413c6d2c4a..00000000000
--- a/external/cub/cub/grid/grid_queue.cuh
+++ /dev/null
@@ -1,220 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridQueue is a descriptor utility for dynamic queue management.
- */
-
-#pragma once
-
-#include "../util_namespace.cuh"
-#include "../util_debug.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/**
- * \brief GridQueue is a descriptor utility for dynamic queue management.
- *
- * \par Overview
- * GridQueue descriptors provides abstractions for "filling" or
- * "draining" globally-shared vectors.
- *
- * \par
- * A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
- * returning a unique offset for the calling thread to write its items.
- * The GridQueue maintains the total "fill-size".  The fill counter must be reset
- * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
- * will be filling.
- *
- * \par
- * Similarly, a "draining" GridQueue works by works by atomically-incrementing a
- * zero-initialized counter, returning a unique offset for the calling thread to
- * read its items. Threads can safely drain until the array's logical fill-size is
- * exceeded.  The drain counter must be reset using GridQueue::ResetDrain or
- * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
- * will be filling.  (For dynamic work distribution of existing data, the corresponding fill-size
- * is simply the number of elements in the array.)
- *
- * \par
- * Iterative work management can be implemented simply with a pair of flip-flopping
- * work buffers, each with an associated set of fill and drain GridQueue descriptors.
- *
- * \tparam OffsetT Signed integer type for global offsets
- */
-template <typename OffsetT>
-class GridQueue
-{
-private:
-
-    /// Counter indices
-    enum
-    {
-        FILL    = 0,
-        DRAIN   = 1,
-    };
-
-    /// Pair of counters
-    OffsetT *d_counters;
-
-public:
-
-    /// Returns the device allocation size in bytes needed to construct a GridQueue instance
-    __host__ __device__ __forceinline__
-    static size_t AllocationSize()
-    {
-        return sizeof(OffsetT) * 2;
-    }
-
-
-    /// Constructs an invalid GridQueue descriptor
-    __host__ __device__ __forceinline__ GridQueue()
-    :
-        d_counters(NULL)
-    {}
-
-
-    /// Constructs a GridQueue descriptor around the device storage allocation
-    __host__ __device__ __forceinline__ GridQueue(
-        void *d_storage)                    ///< Device allocation to back the GridQueue.  Must be at least as big as <tt>AllocationSize()</tt>.
-    :
-        d_counters((OffsetT*) d_storage)
-    {}
-
-
-    /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance.  To be called by the host or by a kernel prior to that which will be draining.
-    __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain(
-        OffsetT fill_size,
-        cudaStream_t stream = 0)
-    {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[FILL] = fill_size;
-        d_counters[DRAIN] = 0;
-        return cudaSuccess;
-#else
-        OffsetT counters[2];
-        counters[FILL] = fill_size;
-        counters[DRAIN] = 0;
-        return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
-#endif
-    }
-
-
-    /// This operation resets the drain so that it may advance to meet the existing fill-size.  To be called by the host or by a kernel prior to that which will be draining.
-    __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0)
-    {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[DRAIN] = 0;
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
-#endif
-    }
-
-
-    /// This operation resets the fill counter.  To be called by the host or by a kernel prior to that which will be filling.
-    __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0)
-    {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[FILL] = 0;
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
-#endif
-    }
-
-
-    /// Returns the fill-size established by the parent or by the previous kernel.
-    __host__ __device__ __forceinline__ cudaError_t FillSize(
-        OffsetT &fill_size,
-        cudaStream_t stream = 0)
-    {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        fill_size = d_counters[FILL];
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
-#endif
-    }
-
-
-    /// Drain \p num_items from the queue.  Returns offset from which to read items.  To be called from CUDA kernel.
-    __device__ __forceinline__ OffsetT Drain(OffsetT num_items)
-    {
-        return atomicAdd(d_counters + DRAIN, num_items);
-    }
-
-
-    /// Fill \p num_items into the queue.  Returns offset from which to write items.    To be called from CUDA kernel.
-    __device__ __forceinline__ OffsetT Fill(OffsetT num_items)
-    {
-        return atomicAdd(d_counters + FILL, num_items);
-    }
-};
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/**
- * Reset grid queue (call with 1 block of 1 thread)
- */
-template <typename OffsetT>
-__global__ void FillAndResetDrainKernel(
-    GridQueue<OffsetT>   grid_queue,
-    OffsetT              num_items)
-{
-    grid_queue.FillAndResetDrain(num_items);
-}
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/external/cub/cub/host/mutex.cuh b/external/cub/cub/host/mutex.cuh
deleted file mode 100644
index 0054f1f916d..00000000000
--- a/external/cub/cub/host/mutex.cuh
+++ /dev/null
@@ -1,171 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Simple portable mutex
- */
-
-
-#pragma once
-
-#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
-    #include <mutex>
-#else
-    #if defined(_WIN32) || defined(_WIN64)
-        #include <intrin.h>
-
-        #define WIN32_LEAN_AND_MEAN
-        #define NOMINMAX
-        #include <windows.h>
-        #undef WIN32_LEAN_AND_MEAN
-        #undef NOMINMAX
-
-        /**
-         * Compiler read/write barrier
-         */
-        #pragma intrinsic(_ReadWriteBarrier)
-
-    #endif
-#endif
-
-#include "../util_namespace.cuh"
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * Simple portable mutex
- *   - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms)
- *   - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++)
- */
-struct Mutex
-{
-#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
-
-    std::mutex mtx;
-
-    void Lock()
-    {
-        mtx.lock();
-    }
-
-    void Unlock()
-    {
-        mtx.unlock();
-    }
-
-    void TryLock()
-    {
-        mtx.try_lock();
-    }
-
-#else       //__cplusplus > 199711L
-
-    #if defined(_MSC_VER)
-
-        // Microsoft VC++
-        typedef long Spinlock;
-
-    #else
-
-        // GNU g++
-        typedef int Spinlock;
-
-        /**
-         * Compiler read/write barrier
-         */
-        __forceinline__ void _ReadWriteBarrier()
-        {
-            __sync_synchronize();
-        }
-
-        /**
-         * Atomic exchange
-         */
-        __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value)
-        {
-            // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier
-            _ReadWriteBarrier();
-            return __sync_lock_test_and_set(Target, Value);
-        }
-
-        /**
-         * Pause instruction to prevent excess processor bus usage
-         */
-        __forceinline__ void YieldProcessor()
-        {
-        }
-
-    #endif  // defined(_MSC_VER)
-
-        /// Lock member
-        volatile Spinlock lock;
-
-        /**
-         * Constructor
-         */
-        Mutex() : lock(0) {}
-
-        /**
-         * Return when the specified spinlock has been acquired
-         */
-        __forceinline__ void Lock()
-        {
-            while (1)
-            {
-                if (!_InterlockedExchange(&lock, 1)) return;
-                while (lock) YieldProcessor();
-            }
-        }
-
-
-        /**
-         * Release the specified spinlock
-         */
-        __forceinline__ void Unlock()
-        {
-            _ReadWriteBarrier();
-            lock = 0;
-        }
-
-#endif      // __cplusplus > 199711L
-
-};
-
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/external/cub/cub/iterator/arg_index_input_iterator.cuh b/external/cub/cub/iterator/arg_index_input_iterator.cuh
deleted file mode 100644
index d3bce583d8c..00000000000
--- a/external/cub/cub/iterator/arg_index_input_iterator.cuh
+++ /dev/null
@@ -1,259 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#include <thrust/version.h>
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples).
- *
- * \par Overview
- * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT.
- *   Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose
- *   \p key field is \p i and whose \p value field is <tt>itr[i]</tt>.
- * - Can be used with any data type.
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
- *   device memory can only be dereferenced on the device.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto
- * dereference an array of doubles
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/arg_index_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * double *d_in;         // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::ArgIndexInputIterator<double*> itr(d_in);
- *
- * // Within device code:
- * typedef typename cub::ArgIndexInputIterator<double*>::value_type Tuple;
- * Tuple item_offset_pair.key = *itr;
- * printf("%f @ %d\n",
- *   item_offset_pair.value,
- *   item_offset_pair.key);   // 8.0 @ 0
- *
- * itr = itr + 6;
- * item_offset_pair.key = *itr;
- * printf("%f @ %d\n",
- *   item_offset_pair.value,
- *   item_offset_pair.key);   // 9.0 @ 6
- *
- * \endcode
- *
- * \tparam InputIteratorT       The value type of the wrapped input iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- * \tparam OutputValueT         The paired value type of the <offset,value> tuple (Default: value type of input iterator)
- */
-template <
-    typename    InputIteratorT,
-    typename    OffsetT             = ptrdiff_t,
-    typename    OutputValueT        = typename std::iterator_traits<InputIteratorT>::value_type>
-class ArgIndexInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef ArgIndexInputIterator                       self_type;              ///< My own type
-    typedef OffsetT                                     difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef KeyValuePair<difference_type, OutputValueT> value_type;             ///< The type of the element the iterator can point to
-    typedef value_type*                                 pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef value_type                                  reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    InputIteratorT  itr;
-    difference_type offset;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ArgIndexInputIterator(
-        InputIteratorT  itr,            ///< Input iterator to wrap
-        difference_type offset = 0)     ///< OffsetT (in items) from \p itr denoting the position of the iterator
-    :
-        itr(itr),
-        offset(offset)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        value_type retval;
-        retval.value = itr[offset];
-        retval.key = offset;
-        return retval;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(itr, offset + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(itr, offset - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return offset - other.offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        self_type offset = (*this) + n;
-        return *offset;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &(*(*this));
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return ((itr == rhs.itr) && (offset == rhs.offset));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return ((itr != rhs.itr) || (offset != rhs.offset));
-    }
-
-    /// Normalize
-    __host__ __device__ __forceinline__ void normalize()
-    {
-        itr += offset;
-        offset = 0;
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
-    {
-        return os;
-    }
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/cache_modified_input_iterator.cuh b/external/cub/cub/iterator/cache_modified_input_iterator.cuh
deleted file mode 100644
index 0c0252c8b1a..00000000000
--- a/external/cub/cub/iterator/cache_modified_input_iterator.cuh
+++ /dev/null
@@ -1,240 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
- *
- * \par Overview
- * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native
- *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
- *   made by reading \p ValueType values through loads modified by \p MODIFIER.
- * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG",
- *   "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.).
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions, but can only be dereferenced within device functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto
- * dereference a device array of double using the "ldg" PTX load modifier
- * (i.e., load values through texture cache).
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * double *d_in;            // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
- *
- * // Within device code:
- * printf("%f\n", itr[0]);  // 8.0
- * printf("%f\n", itr[1]);  // 6.0
- * printf("%f\n", itr[6]);  // 9.0
- *
- * \endcode
- *
- * \tparam CacheLoadModifier    The cub::CacheLoadModifier to use when accessing data
- * \tparam ValueType            The value type of this iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    CacheLoadModifier   MODIFIER,
-    typename            ValueType,
-    typename            OffsetT = ptrdiff_t>
-class CacheModifiedInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef CacheModifiedInputIterator          self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-
-public:
-
-    /// Wrapped native pointer
-    ValueType* ptr;
-
-    /// Constructor
-    template <typename QualifiedValueType>
-    __host__ __device__ __forceinline__ CacheModifiedInputIterator(
-        QualifiedValueType* ptr)     ///< Native pointer to wrap
-    :
-        ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        ptr++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        ptr++;
-        return *this;
-    }
-
-    /// Indirection
-    __device__ __forceinline__ reference operator*() const
-    {
-        return ThreadLoad<MODIFIER>(ptr);
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(ptr + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        ptr += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(ptr - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        ptr -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return ptr - other.ptr;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return ThreadLoad<MODIFIER>(ptr + n);
-    }
-
-    /// Structure dereference
-    __device__ __forceinline__ pointer operator->()
-    {
-        return &ThreadLoad<MODIFIER>(ptr);
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (ptr == rhs.ptr);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (ptr != rhs.ptr);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
-    {
-        return os;
-    }
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/cache_modified_output_iterator.cuh b/external/cub/cub/iterator/cache_modified_output_iterator.cuh
deleted file mode 100644
index 8dbaafa61c5..00000000000
--- a/external/cub/cub/iterator/cache_modified_output_iterator.cuh
+++ /dev/null
@@ -1,254 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access output wrapper for storing array values using a PTX cache-modifier.
- *
- * \par Overview
- * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native
- *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
- *   made by writing \p ValueType values through stores modified by \p MODIFIER.
- * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB",
- *   "STORE_CG", "STORE_CS", "STORE_WT", etc.).
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions, but can only be dereferenced within device functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to
- * dereference a device array of doubles using the "wt" PTX load modifier
- * (i.e., write-through to system memory).
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_output_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * double *d_out;              // e.g., [, , , , , , ]
- *
- * // Create an iterator wrapper
- * cub::CacheModifiedOutputIterator<cub::STORE_WT, double> itr(d_out);
- *
- * // Within device code:
- * itr[0]  = 8.0;
- * itr[1]  = 66.0;
- * itr[55] = 24.0;
- *
- * \endcode
- *
- * \par Usage Considerations
- * - Can only be dereferenced within device code
- *
- * \tparam CacheStoreModifier     The cub::CacheStoreModifier to use when accessing data
- * \tparam ValueType            The value type of this iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    CacheStoreModifier  MODIFIER,
-    typename            ValueType,
-    typename            OffsetT = ptrdiff_t>
-class CacheModifiedOutputIterator
-{
-private:
-
-    // Proxy object
-    struct Reference
-    {
-        ValueType* ptr;
-
-        /// Constructor
-        __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {}
-
-        /// Assignment
-        __device__ __forceinline__ ValueType operator =(ValueType val)
-        {
-            ThreadStore<MODIFIER>(ptr, val);
-            return val;
-        }
-    };
-
-public:
-
-    // Required iterator traits
-    typedef CacheModifiedOutputIterator         self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef void                                value_type;             ///< The type of the element the iterator can point to
-    typedef void                                pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef Reference                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ValueType* ptr;
-
-public:
-
-    /// Constructor
-    template <typename QualifiedValueType>
-    __host__ __device__ __forceinline__ CacheModifiedOutputIterator(
-        QualifiedValueType* ptr)     ///< Native pointer to wrap
-    :
-        ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        ptr++;
-        return retval;
-    }
-
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        ptr++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return Reference(ptr);
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(ptr + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        ptr += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(ptr - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        ptr -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return ptr - other.ptr;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return Reference(ptr + n);
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (ptr == rhs.ptr);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (ptr != rhs.ptr);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-};
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/constant_input_iterator.cuh b/external/cub/cub/iterator/constant_input_iterator.cuh
deleted file mode 100644
index 0b7af478d74..00000000000
--- a/external/cub/cub/iterator/constant_input_iterator.cuh
+++ /dev/null
@@ -1,235 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input generator for dereferencing a sequence of homogeneous values
- *
- * \par Overview
- * - Read references to a ConstantInputIteratorTiterator always return the supplied constant
- *   of type \p ValueType.
- * - Can be used with any data type.
- * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
- *   functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p ConstantInputIteratorTto
- * dereference a sequence of homogeneous doubles.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/constant_input_iterator.cuh>
- *
- * cub::ConstantInputIterator<double> itr(5.0);
- *
- * printf("%f\n", itr[0]);      // 5.0
- * printf("%f\n", itr[1]);      // 5.0
- * printf("%f\n", itr[2]);      // 5.0
- * printf("%f\n", itr[50]);     // 5.0
- *
- * \endcode
- *
- * \tparam ValueType            The value type of this iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename ValueType,
-    typename OffsetT = ptrdiff_t>
-class ConstantInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef ConstantInputIterator               self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ValueType   val;
-    OffsetT     offset;
-#ifdef _WIN32
-    OffsetT     pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))];        // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
-#endif
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ConstantInputIterator(
-        ValueType   val,            ///< Starting value for the iterator instance to report
-        OffsetT     offset = 0)     ///< Base offset
-    :
-        val(val),
-        offset(offset)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return val;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(val, offset + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(val, offset - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return offset - other.offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const
-    {
-        return val;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &val;
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (offset == rhs.offset) && ((val == rhs.val));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (offset != rhs.offset) || (val!= rhs.val);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        os << "[" << itr.val << "," << itr.offset << "]";
-        return os;
-    }
-
-};
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/counting_input_iterator.cuh b/external/cub/cub/iterator/counting_input_iterator.cuh
deleted file mode 100644
index 3b42a00d181..00000000000
--- a/external/cub/cub/iterator/counting_input_iterator.cuh
+++ /dev/null
@@ -1,228 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-/**
- * \brief A random-access input generator for dereferencing a sequence of incrementing integer values.
- *
- * \par Overview
- * - After initializing a CountingInputIteratorTto a certain integer \p base, read references
- *   at \p offset will return the value \p base + \p offset.
- * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
- *   functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p CountingInputIteratorTto
- * dereference a sequence of incrementing integers.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/counting_input_iterator.cuh>
- *
- * cub::CountingInputIterator<int> itr(5);
- *
- * printf("%d\n", itr[0]);      // 5
- * printf("%d\n", itr[1]);      // 6
- * printf("%d\n", itr[2]);      // 7
- * printf("%d\n", itr[50]);     // 55
- *
- * \endcode
- *
- * \tparam ValueType            The value type of this iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename ValueType,
-    typename OffsetT = ptrdiff_t>
-class CountingInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef CountingInputIterator               self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ValueType val;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ CountingInputIterator(
-        const ValueType &val)          ///< Starting value for the iterator instance to report
-    :
-        val(val)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        val++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        val++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return val;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(val + (ValueType) n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        val += (ValueType) n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(val - (ValueType) n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        val -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return (difference_type) (val - other.val);
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return val + (ValueType) n;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &val;
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (val == rhs.val);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (val != rhs.val);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        os << "[" << itr.val << "]";
-        return os;
-    }
-
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/discard_output_iterator.cuh b/external/cub/cub/iterator/discard_output_iterator.cuh
deleted file mode 100644
index 1fca08c062d..00000000000
--- a/external/cub/cub/iterator/discard_output_iterator.cuh
+++ /dev/null
@@ -1,220 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../util_namespace.cuh"
-#include "../util_macro.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A discard iterator
- */
-template <typename OffsetT = ptrdiff_t>
-class DiscardOutputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef DiscardOutputIterator   self_type;              ///< My own type
-    typedef OffsetT                 difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef void                    value_type;             ///< The type of the element the iterator can point to
-    typedef void                    pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef void                    reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    OffsetT offset;
-
-#if defined(_WIN32) || !defined(_WIN64)
-    // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
-    OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))];
-#endif
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ DiscardOutputIterator(
-        OffsetT offset = 0)     ///< Base offset
-    :
-        offset(offset)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ self_type& operator*()
-    {
-        // return self reference, which can be assigned to anything
-        return *this;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(offset + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(offset - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return offset - other.offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator[](Distance n)
-    {
-        // return self reference, which can be assigned to anything
-        return *this;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return;
-    }
-
-    /// Assignment to self (no-op)
-    __host__ __device__ __forceinline__ void operator=(self_type const& other)
-    {
-        offset = other.offset;
-    }
-
-    /// Assignment to anything else (no-op)
-    template<typename T>
-    __host__ __device__ __forceinline__ void operator=(T const&)
-    {}
-
-    /// Cast to void* operator
-    __host__ __device__ __forceinline__ operator void*() const { return NULL; }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (offset == rhs.offset);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (offset != rhs.offset);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        os << "[" << itr.offset << "]";
-        return os;
-    }
-
-};
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/tex_obj_input_iterator.cuh b/external/cub/cub/iterator/tex_obj_input_iterator.cuh
deleted file mode 100644
index 623609452fd..00000000000
--- a/external/cub/cub/iterator/tex_obj_input_iterator.cuh
+++ /dev/null
@@ -1,310 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_debug.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-
-/**
- * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses newer Kepler-style texture objects.
- *
- * \par Overview
- * - TexObjInputIteratorTwraps a native device pointer of type <tt>ValueType*</tt>. References
- *   to elements are to be loaded through texture cache.
- * - Can be used to load any data type from memory through texture cache.
- * - Can be manipulated and exchanged within and between host and device
- *   functions, can only be constructed within host functions, and can only be
- *   dereferenced within device functions.
- * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be
- *   created by the host thread, but can be used by any descendant kernel.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p TexRefInputIteratorTto
- * dereference a device array of doubles through texture cache.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_obj_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * int num_items;   // e.g., 7
- * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::TexObjInputIterator<double> itr;
- * itr.BindTexture(d_in, sizeof(double) * num_items);
- * ...
- *
- * // Within device code:
- * printf("%f\n", itr[0]);      // 8.0
- * printf("%f\n", itr[1]);      // 6.0
- * printf("%f\n", itr[6]);      // 9.0
- *
- * ...
- * itr.UnbindTexture();
- *
- * \endcode
- *
- * \tparam T                    The value type of this iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename    T,
-    typename    OffsetT = ptrdiff_t>
-class TexObjInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef TexObjInputIterator                 self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef T                                   value_type;             ///< The type of the element the iterator can point to
-    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    // Largest texture word we can use in device
-    typedef typename UnitWord<T>::TextureWord TextureWord;
-
-    // Number of texture words per T
-    enum {
-        TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
-    };
-
-private:
-
-    T*                  ptr;
-    difference_type     tex_offset;
-    cudaTextureObject_t tex_obj;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ TexObjInputIterator()
-    :
-        ptr(NULL),
-        tex_offset(0),
-        tex_obj(0)
-    {}
-
-    /// Use this iterator to bind \p ptr with a texture reference
-    template <typename QualifiedT>
-    cudaError_t BindTexture(
-        QualifiedT      *ptr,               ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
-        size_t          bytes = size_t(-1),         ///< Number of bytes in the range
-        size_t          tex_offset = 0)     ///< OffsetT (in items) from \p ptr denoting the position of the iterator
-    {
-        this->ptr = const_cast<typename RemoveQualifiers<QualifiedT>::Type *>(ptr);
-        this->tex_offset = tex_offset;
-
-        cudaChannelFormatDesc   channel_desc = cudaCreateChannelDesc<TextureWord>();
-        cudaResourceDesc        res_desc;
-        cudaTextureDesc         tex_desc;
-        memset(&res_desc, 0, sizeof(cudaResourceDesc));
-        memset(&tex_desc, 0, sizeof(cudaTextureDesc));
-        res_desc.resType                = cudaResourceTypeLinear;
-        res_desc.res.linear.devPtr      = this->ptr;
-        res_desc.res.linear.desc        = channel_desc;
-        res_desc.res.linear.sizeInBytes = bytes;
-        tex_desc.readMode               = cudaReadModeElementType;
-        return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
-    }
-
-    /// Unbind this iterator from its texture reference
-    cudaError_t UnbindTexture()
-    {
-        return cudaDestroyTextureObject(tex_obj);
-    }
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        tex_offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        tex_offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-#if (CUB_PTX_ARCH == 0)
-        // Simply dereference the pointer on the host
-        return ptr[tex_offset];
-#else
-        // Move array of uninitialized words, then alias and assign to return value
-        TextureWord words[TEXTURE_MULTIPLE];
-
-        #pragma unroll
-        for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
-        {
-            words[i] = tex1Dfetch<TextureWord>(
-                tex_obj,
-                (tex_offset * TEXTURE_MULTIPLE) + i);
-        }
-
-        // Load from words
-        return *reinterpret_cast<T*>(words);
-#endif
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval;
-        retval.ptr          = ptr;
-        retval.tex_obj      = tex_obj;
-        retval.tex_offset   = tex_offset + n;
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        tex_offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval;
-        retval.ptr          = ptr;
-        retval.tex_obj      = tex_obj;
-        retval.tex_offset   = tex_offset - n;
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        tex_offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return tex_offset - other.tex_offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        self_type offset = (*this) + n;
-        return *offset;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &(*(*this));
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj));
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/iterator/tex_ref_input_iterator.cuh b/external/cub/cub/iterator/tex_ref_input_iterator.cuh
deleted file mode 100644
index da1fd166177..00000000000
--- a/external/cub/cub/iterator/tex_ref_input_iterator.cuh
+++ /dev/null
@@ -1,374 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_debug.cuh"
-#include "../util_namespace.cuh"
-
-#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE)  // This iterator is compatible with CUDA 5.5 and newer
-
-#if (THRUST_VERSION >= 100700)    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Static file-scope Tesla/Fermi-style texture references
- *****************************************************************************/
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-// Anonymous namespace
-namespace {
-
-/// Global texture reference specialized by type
-template <typename T>
-struct IteratorTexRef
-{
-    /// And by unique ID
-    template <int UNIQUE_ID>
-    struct TexId
-    {
-        // Largest texture word we can use in device
-        typedef typename UnitWord<T>::DeviceWord DeviceWord;
-        typedef typename UnitWord<T>::TextureWord TextureWord;
-
-        // Number of texture words per T
-        enum {
-            DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord),
-            TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
-        };
-
-        // Texture reference type
-        typedef texture<TextureWord> TexRef;
-
-        // Texture reference
-        static TexRef ref;
-
-        /// Bind texture
-        static cudaError_t BindTexture(void *d_in, size_t &offset)
-        {
-            if (d_in)
-            {
-                cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<TextureWord>();
-                ref.channelDesc = tex_desc;
-                return (CubDebug(cudaBindTexture(&offset, ref, d_in)));
-            }
-
-            return cudaSuccess;
-        }
-
-        /// Unbind texture
-        static cudaError_t UnbindTexture()
-        {
-            return CubDebug(cudaUnbindTexture(ref));
-        }
-
-        /// Fetch element
-        template <typename Distance>
-        static __device__ __forceinline__ T Fetch(Distance tex_offset)
-        {
-            DeviceWord temp[DEVICE_MULTIPLE];
-            TextureWord *words = reinterpret_cast<TextureWord*>(temp);
-
-            #pragma unroll
-            for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
-            {
-                words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i);
-            }
-
-            return reinterpret_cast<T&>(temp);
-        }
-    };
-};
-
-// Texture reference definitions
-template <typename  T>
-template <int       UNIQUE_ID>
-typename IteratorTexRef<T>::template TexId<UNIQUE_ID>::TexRef IteratorTexRef<T>::template TexId<UNIQUE_ID>::ref = 0;
-
-
-} // Anonymous namespace
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-
-/**
- * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses older Tesla/Fermi-style texture references.
- *
- * \par Overview
- * - TexRefInputIteratorTwraps a native device pointer of type <tt>ValueType*</tt>. References
- *   to elements are to be loaded through texture cache.
- * - Can be used to load any data type from memory through texture cache.
- * - Can be manipulated and exchanged within and between host and device
- *   functions, can only be constructed within host functions, and can only be
- *   dereferenced within device functions.
- * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture
- *   reference.  Only one TexRefInputIteratorTinstance can be bound at any given time for a
- *   specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host
- *   thread, and (4) compilation .o unit.
- * - With regard to nested/dynamic parallelism, TexRefInputIteratorTiterators may only be
- *   created by the host thread and used by a top-level kernel (i.e. the one which is launched
- *   from the host).
- * - Compatible with Thrust API v1.7 or newer.
- * - Compatible with CUDA toolkit v5.5 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p TexRefInputIteratorTto
- * dereference a device array of doubles through texture cache.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_ref_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * int num_items;   // e.g., 7
- * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::TexRefInputIterator<double, __LINE__> itr;
- * itr.BindTexture(d_in, sizeof(double) * num_items);
- * ...
- *
- * // Within device code:
- * printf("%f\n", itr[0]);      // 8.0
- * printf("%f\n", itr[1]);      // 6.0
- * printf("%f\n", itr[6]);      // 9.0
- *
- * ...
- * itr.UnbindTexture();
- *
- * \endcode
- *
- * \tparam T                    The value type of this iterator
- * \tparam UNIQUE_ID            A globally-unique identifier (within the compilation unit) to name the underlying texture reference
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename    T,
-    int         UNIQUE_ID,
-    typename    OffsetT = ptrdiff_t>
-class TexRefInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef TexRefInputIterator                 self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef T                                   value_type;             ///< The type of the element the iterator can point to
-    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    T*              ptr;
-    difference_type tex_offset;
-
-    // Texture reference wrapper (old Tesla/Fermi-style textures)
-    typedef typename IteratorTexRef<T>::template TexId<UNIQUE_ID> TexId;
-
-public:
-/*
-    /// Constructor
-    __host__ __device__ __forceinline__ TexRefInputIterator()
-    :
-        ptr(NULL),
-        tex_offset(0)
-    {}
-*/
-    /// Use this iterator to bind \p ptr with a texture reference
-    template <typename QualifiedT>
-    cudaError_t BindTexture(
-        QualifiedT      *ptr,                   ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
-        size_t          bytes = size_t(-1),     ///< Number of bytes in the range
-        size_t          tex_offset = 0)         ///< OffsetT (in items) from \p ptr denoting the position of the iterator
-    {
-        this->ptr = const_cast<typename RemoveQualifiers<QualifiedT>::Type *>(ptr);
-        size_t offset;
-        cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset);
-        this->tex_offset = (difference_type) (offset / sizeof(QualifiedT));
-        return retval;
-    }
-
-    /// Unbind this iterator from its texture reference
-    cudaError_t UnbindTexture()
-    {
-        return TexId::UnbindTexture();
-    }
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        tex_offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        tex_offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-#if (CUB_PTX_ARCH == 0)
-        // Simply dereference the pointer on the host
-        return ptr[tex_offset];
-#else
-        // Use the texture reference
-        return TexId::Fetch(tex_offset);
-#endif
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval;
-        retval.ptr = ptr;
-        retval.tex_offset = tex_offset + n;
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        tex_offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval;
-        retval.ptr = ptr;
-        retval.tex_offset = tex_offset - n;
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        tex_offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return tex_offset - other.tex_offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        self_type offset = (*this) + n;
-        return *offset;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &(*(*this));
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset));
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-#endif // CUDA_VERSION
diff --git a/external/cub/cub/iterator/transform_input_iterator.cuh b/external/cub/cub/iterator/transform_input_iterator.cuh
deleted file mode 100644
index 39258a40c9b..00000000000
--- a/external/cub/cub/iterator/transform_input_iterator.cuh
+++ /dev/null
@@ -1,252 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input wrapper for transforming dereferenced values.
- *
- * \par Overview
- * - TransformInputIteratorTwraps a unary conversion functor of type \p
- *   ConversionOp and a random-access input iterator of type <tt>InputIteratorT</tt>,
- *   using the former to produce references of type \p ValueType from the latter.
- * - Can be used with any data type.
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
- *   device memory can only be dereferenced on the device.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p TransformInputIteratorTto
- * dereference an array of integers, tripling the values and converting them to doubles.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/transform_input_iterator.cuh>
- *
- * // Functor for tripling integer values and converting to doubles
- * struct TripleDoubler
- * {
- *     __host__ __device__ __forceinline__
- *     double operator()(const int &a) const {
- *         return double(a * 3);
- *     }
- * };
- *
- * // Declare, allocate, and initialize a device array
- * int *d_in;                   // e.g., [8, 6, 7, 5, 3, 0, 9]
- * TripleDoubler conversion_op;
- *
- * // Create an iterator wrapper
- * cub::TransformInputIterator<double, TripleDoubler, int*> itr(d_in, conversion_op);
- *
- * // Within device code:
- * printf("%f\n", itr[0]);  // 24.0
- * printf("%f\n", itr[1]);  // 18.0
- * printf("%f\n", itr[6]);  // 27.0
- *
- * \endcode
- *
- * \tparam ValueType            The value type of this iterator
- * \tparam ConversionOp         Unary functor type for mapping objects of type \p InputType to type \p ValueType.  Must have member <tt>ValueType operator()(const InputType &datum)</tt>.
- * \tparam InputIteratorT       The type of the wrapped input iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- *
- */
-template <
-    typename ValueType,
-    typename ConversionOp,
-    typename InputIteratorT,
-    typename OffsetT = ptrdiff_t>
-class TransformInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef TransformInputIterator              self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ConversionOp    conversion_op;
-    InputIteratorT  input_itr;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ TransformInputIterator(
-        InputIteratorT      input_itr,          ///< Input iterator to wrap
-        ConversionOp        conversion_op)      ///< Conversion functor to wrap
-    :
-        conversion_op(conversion_op),
-        input_itr(input_itr)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        input_itr++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        input_itr++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return conversion_op(*input_itr);
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(input_itr + n, conversion_op);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        input_itr += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(input_itr - n, conversion_op);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        input_itr -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return input_itr - other.input_itr;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return conversion_op(input_itr[n]);
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &conversion_op(*input_itr);
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (input_itr == rhs.input_itr);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (input_itr != rhs.input_itr);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_load.cuh b/external/cub/cub/thread/thread_load.cuh
deleted file mode 100644
index 9de4bd4149b..00000000000
--- a/external/cub/cub/thread/thread_load.cuh
+++ /dev/null
@@ -1,438 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for reading memory using PTX cache modifiers.
- */
-
-#pragma once
-
-#include <cuda.h>
-
-#include <iterator>
-
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-//-----------------------------------------------------------------------------
-// Tags and constants
-//-----------------------------------------------------------------------------
-
-/**
- * \brief Enumeration of cache modifiers for memory load operations.
- */
-enum CacheLoadModifier
-{
-    LOAD_DEFAULT,       ///< Default (no modifier)
-    LOAD_CA,            ///< Cache at all levels
-    LOAD_CG,            ///< Cache at global level
-    LOAD_CS,            ///< Cache streaming (likely to be accessed once)
-    LOAD_CV,            ///< Cache as volatile (including cached system lines)
-    LOAD_LDG,           ///< Cache as texture
-    LOAD_VOLATILE,      ///< Volatile (any memory space)
-};
-
-
-/**
- * \name Thread I/O (cache modified)
- * @{
- */
-
-/**
- * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers.  Can be used to load any data type.
- *
- * \par Example
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
- *
- * // 32-bit load using cache-global modifier:
- * int *d_in;
- * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
- *
- * // 16-bit load using default modifier
- * short *d_in;
- * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
- *
- * // 256-bit load using cache-volatile modifier
- * double4 *d_in;
- * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
- *
- * // 96-bit load using cache-streaming modifier
- * struct TestFoo { bool a; short b; };
- * TestFoo *d_struct;
- * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x);
- * \endcode
- *
- * \tparam MODIFIER             <b>[inferred]</b> CacheLoadModifier enumeration
- * \tparam InputIteratorT       <b>[inferred]</b> Input iterator type \iterator
- */
-template <
-    CacheLoadModifier MODIFIER,
-    typename InputIteratorT>
-__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr);
-
-
-//@}  end member group
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/// Helper structure for templated load iteration (inductive case)
-template <int COUNT, int MAX>
-struct IterateThreadLoad
-{
-    template <CacheLoadModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Load(T const *ptr, T *vals)
-    {
-        vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
-        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
-    }
-
-    template <typename InputIteratorT, typename T>
-    static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals)
-    {
-        vals[COUNT] = itr[COUNT];
-        IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals);
-    }
-};
-
-
-/// Helper structure for templated load iteration (termination case)
-template <int MAX>
-struct IterateThreadLoad<MAX, MAX>
-{
-    template <CacheLoadModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {}
-
-    template <typename InputIteratorT, typename T>
-    static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {}
-};
-
-
-/**
- * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier
- */
-#define _CUB_LOAD_16(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr)                   \
-    {                                                                                       \
-        uint4 retval;                                                                       \
-        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :                 \
-            "=r"(retval.x),                                                                 \
-            "=r"(retval.y),                                                                 \
-            "=r"(retval.z),                                                                 \
-            "=r"(retval.w) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr)    \
-    {                                                                                       \
-        ulonglong2 retval;                                                                  \
-        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                         \
-            "=l"(retval.x),                                                                 \
-            "=l"(retval.y) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-/**
- * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier
- */
-#define _CUB_LOAD_8(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr)             \
-    {                                                                                       \
-        ushort4 retval;                                                                     \
-        asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" :                 \
-            "=h"(retval.x),                                                                 \
-            "=h"(retval.y),                                                                 \
-            "=h"(retval.z),                                                                 \
-            "=h"(retval.w) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr)                   \
-    {                                                                                       \
-        uint2 retval;                                                                       \
-        asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" :                         \
-            "=r"(retval.x),                                                                 \
-            "=r"(retval.y) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr)    \
-    {                                                                                       \
-        unsigned long long retval;                                                          \
-        asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" :                                  \
-            "=l"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-/**
- * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier
- */
-#define _CUB_LOAD_4(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr)                      \
-    {                                                                                       \
-        unsigned int retval;                                                                \
-        asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" :                                  \
-            "=r"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-
-/**
- * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier
- */
-#define _CUB_LOAD_2(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr)                \
-    {                                                                                       \
-        unsigned short retval;                                                              \
-        asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" :                                  \
-            "=h"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-
-/**
- * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier
- */
-#define _CUB_LOAD_1(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr)                   \
-    {                                                                                       \
-        unsigned short retval;                                                              \
-        asm volatile (                                                                      \
-        "{"                                                                                 \
-        "   .reg .u8 datum;"                                                                \
-        "    ld."#ptx_modifier".u8 datum, [%1];"                                            \
-        "    cvt.u16.u8 %0, datum;"                                                         \
-        "}" :                                                                               \
-            "=h"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return (unsigned char) retval;                                                      \
-    }
-
-
-/**
- * Define powers-of-two ThreadLoad specializations for the given Cache load modifier
- */
-#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)                                            \
-    _CUB_LOAD_16(cub_modifier, ptx_modifier)                                                 \
-    _CUB_LOAD_8(cub_modifier, ptx_modifier)                                                  \
-    _CUB_LOAD_4(cub_modifier, ptx_modifier)                                                  \
-    _CUB_LOAD_2(cub_modifier, ptx_modifier)                                                  \
-    _CUB_LOAD_1(cub_modifier, ptx_modifier)                                                  \
-
-
-/**
- * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers
- */
-#if CUB_PTX_ARCH >= 200
-    _CUB_LOAD_ALL(LOAD_CA, ca)
-    _CUB_LOAD_ALL(LOAD_CG, cg)
-    _CUB_LOAD_ALL(LOAD_CS, cs)
-    _CUB_LOAD_ALL(LOAD_CV, cv)
-#else
-    _CUB_LOAD_ALL(LOAD_CA, global)
-    // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1
-    _CUB_LOAD_ALL(LOAD_CG, volatile.global)
-    _CUB_LOAD_ALL(LOAD_CS, global)
-    _CUB_LOAD_ALL(LOAD_CV, volatile.global)
-#endif
-
-#if CUB_PTX_ARCH >= 350
-    _CUB_LOAD_ALL(LOAD_LDG, global.nc)
-#else
-    _CUB_LOAD_ALL(LOAD_LDG, global)
-#endif
-
-
-// Macro cleanup
-#undef _CUB_LOAD_ALL
-#undef _CUB_LOAD_1
-#undef _CUB_LOAD_2
-#undef _CUB_LOAD_4
-#undef _CUB_LOAD_8
-#undef _CUB_LOAD_16
-
-
-
-/**
- * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types
- */
-template <typename InputIteratorT>
-__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(
-    InputIteratorT          itr,
-    Int2Type<LOAD_DEFAULT>  /*modifier*/,
-    Int2Type<false>         /*is_pointer*/)
-{
-    return *itr;
-}
-
-
-/**
- * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoad(
-    T                       *ptr,
-    Int2Type<LOAD_DEFAULT>  /*modifier*/,
-    Int2Type<true>          /*is_pointer*/)
-{
-    return *ptr;
-}
-
-
-/**
- * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoadVolatilePointer(
-    T                       *ptr,
-    Int2Type<true>          /*is_primitive*/)
-{
-    T retval = *reinterpret_cast<volatile T*>(ptr);
-    return retval;
-}
-
-
-/**
- * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoadVolatilePointer(
-    T                       *ptr,
-    Int2Type<false>         /*is_primitive*/)
-{
-    typedef typename UnitWord<T>::VolatileWord VolatileWord;   // Word type for memcopying
-
-    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
-/*
-    VolatileWord words[VOLATILE_MULTIPLE];
-
-    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
-        reinterpret_cast<volatile VolatileWord*>(ptr),
-        words);
-
-    return *reinterpret_cast<T*>(words);
-*/
-
-    T retval;
-    VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval);
-    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
-        reinterpret_cast<volatile VolatileWord*>(ptr),
-        words);
-    return retval;
-}
-
-
-/**
- * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoad(
-    T                       *ptr,
-    Int2Type<LOAD_VOLATILE> /*modifier*/,
-    Int2Type<true>          /*is_pointer*/)
-{
-    // Apply tags for partial-specialization
-    return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
-}
-
-
-/**
- * ThreadLoad definition for generic modifiers on pointer types
- */
-template <typename T, int MODIFIER>
-__device__ __forceinline__ T ThreadLoad(
-    T const                 *ptr,
-    Int2Type<MODIFIER>      /*modifier*/,
-    Int2Type<true>          /*is_pointer*/)
-{
-    typedef typename UnitWord<T>::DeviceWord DeviceWord;
-
-    const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
-
-    DeviceWord words[DEVICE_MULTIPLE];
-
-    IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>(
-        reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
-        words);
-
-    return *reinterpret_cast<T*>(words);
-}
-
-
-/**
- * ThreadLoad definition for generic modifiers
- */
-template <
-    CacheLoadModifier MODIFIER,
-    typename InputIteratorT>
-__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
-{
-    // Apply tags for partial-specialization
-    return ThreadLoad(
-        itr,
-        Int2Type<MODIFIER>(),
-        Int2Type<IsPointer<InputIteratorT>::VALUE>());
-}
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/** @} */       // end group UtilIo
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_operators.cuh b/external/cub/cub/thread/thread_operators.cuh
deleted file mode 100644
index 2bd5403e864..00000000000
--- a/external/cub/cub/thread/thread_operators.cuh
+++ /dev/null
@@ -1,317 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Simple binary operator functor types
- */
-
-/******************************************************************************
- * Simple functor operators
- ******************************************************************************/
-
-#pragma once
-
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-/**
- * \brief Default equality functor
- */
-struct Equality
-{
-    /// Boolean equality operator, returns <tt>(a == b)</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
-    {
-        return a == b;
-    }
-};
-
-
-/**
- * \brief Default inequality functor
- */
-struct Inequality
-{
-    /// Boolean inequality operator, returns <tt>(a != b)</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
-    {
-        return a != b;
-    }
-};
-
-
-/**
- * \brief Inequality functor (wraps equality functor)
- */
-template <typename EqualityOp>
-struct InequalityWrapper
-{
-    /// Wrapped equality operator
-    EqualityOp op;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    InequalityWrapper(EqualityOp op) : op(op) {}
-
-    /// Boolean inequality operator, returns <tt>(a != b)</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b)
-    {
-        return !op(a, b);
-    }
-};
-
-
-/**
- * \brief Default sum functor
- */
-struct Sum
-{
-    /// Boolean sum operator, returns <tt>a + b</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return a + b;
-    }
-};
-
-
-/**
- * \brief Default max functor
- */
-struct Max
-{
-    /// Boolean max operator, returns <tt>(a > b) ? a : b</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return CUB_MAX(a, b);
-    }
-};
-
-
-/**
- * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item)
- */
-struct ArgMax
-{
-    /// Boolean max operator, preferring the item having the smaller offset in case of ties
-    template <typename T, typename OffsetT>
-    __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
-        const KeyValuePair<OffsetT, T> &a,
-        const KeyValuePair<OffsetT, T> &b) const
-    {
-// Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
-//        return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
-
-        if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key)))
-            return b;
-        return a;
-    }
-};
-
-
-/**
- * \brief Default min functor
- */
-struct Min
-{
-    /// Boolean min operator, returns <tt>(a < b) ? a : b</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return CUB_MIN(a, b);
-    }
-};
-
-
-/**
- * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
- */
-struct ArgMin
-{
-    /// Boolean min operator, preferring the item having the smaller offset in case of ties
-    template <typename T, typename OffsetT>
-    __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
-        const KeyValuePair<OffsetT, T> &a,
-        const KeyValuePair<OffsetT, T> &b) const
-    {
-// Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
-//        return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
-
-        if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key)))
-            return b;
-        return a;
-    }
-};
-
-
-/**
- * \brief Default cast functor
- */
-template <typename B>
-struct CastOp
-{
-    /// Cast operator, returns <tt>(B) a</tt>
-    template <typename A>
-    __host__ __device__ __forceinline__ B operator()(const A &a) const
-    {
-        return (B) a;
-    }
-};
-
-
-/**
- * \brief Binary operator wrapper for switching non-commutative scan arguments
- */
-template <typename ScanOp>
-class SwizzleScanOp
-{
-private:
-
-    /// Wrapped scan operator
-    ScanOp scan_op;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
-
-    /// Switch the scan arguments
-    template <typename T>
-    __host__ __device__ __forceinline__
-    T operator()(const T &a, const T &b)
-    {
-      T _a(a);
-      T _b(b);
-
-      return scan_op(_b, _a);
-    }
-};
-
-
-/**
- * \brief Reduce-by-segment functor.
- *
- * Given two cub::KeyValuePair inputs \p a and \p b and a
- * binary associative combining operator \p <tt>f(const T &x, const T &y)</tt>,
- * an instance of this functor returns a cub::KeyValuePair whose \p key
- * field is <tt>a.key</tt> + <tt>b.key</tt>, and whose \p value field
- * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise.
- *
- * ReduceBySegmentOp is an associative, non-commutative binary combining operator
- * for input sequences of cub::KeyValuePair pairings.  Such
- * sequences are typically used to represent a segmented set of values to be reduced
- * and a corresponding set of {0,1}-valued integer "head flags" demarcating the
- * first value of each segment.
- *
- */
-template <typename ReductionOpT>    ///< Binary reduction operator to apply to values
-struct ReduceBySegmentOp
-{
-    /// Wrapped reduction operator
-    ReductionOpT op;
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ReduceBySegmentOp() {}
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {}
-
-    /// Scan operator
-    template <typename KeyValuePairT>       ///< KeyValuePair pairing of T (value) and OffsetT (head flag)
-    __host__ __device__ __forceinline__ KeyValuePairT operator()(
-        const KeyValuePairT &first,         ///< First partial reduction
-        const KeyValuePairT &second)        ///< Second partial reduction
-    {
-        KeyValuePairT retval;
-        retval.key = first.key + second.key;
-        retval.value = (second.key) ?
-                second.value :                          // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate
-                op(first.value, second.value);          // The second partial reduction does not span a reset, so accumulate both into the running aggregate
-        return retval;
-    }
-};
-
-
-
-template <typename ReductionOpT>    ///< Binary reduction operator to apply to values
-struct ReduceByKeyOp
-{
-    /// Wrapped reduction operator
-    ReductionOpT op;
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ReduceByKeyOp() {}
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {}
-
-    /// Scan operator
-    template <typename KeyValuePairT>
-    __host__ __device__ __forceinline__ KeyValuePairT operator()(
-        const KeyValuePairT &first,       ///< First partial reduction
-        const KeyValuePairT &second)      ///< Second partial reduction
-    {
-        KeyValuePairT retval = second;
-
-        if (first.key == second.key)
-            retval.value = op(first.value, retval.value);
-
-        return retval;
-    }
-};
-
-
-
-
-
-
-
-/** @} */       // end group UtilModule
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_reduce.cuh b/external/cub/cub/thread/thread_reduce.cuh
deleted file mode 100644
index 9e277050236..00000000000
--- a/external/cub/cub/thread/thread_reduce.cuh
+++ /dev/null
@@ -1,152 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for sequential reduction over statically-sized array types
- */
-
-#pragma once
-
-#include "../thread/thread_operators.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
-namespace internal {
-
-/**
- * Sequential reduction over statically-sized array types
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T*                  input,                  ///< [in] Input array
-    ReductionOp         reduction_op,           ///< [in] Binary reduction operator
-    T                   prefix,                 ///< [in] Prefix to seed reduction with
-    Int2Type<LENGTH>    /*length*/)
-{
-    T retval = prefix;
-
-    #pragma unroll
-    for (int i = 0; i < LENGTH; ++i)
-        retval = reduction_op(retval, input[i]);
-
-    return retval;
-}
-
-
-/**
- * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     LengthT of input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T*          input,                  ///< [in] Input array
-    ReductionOp reduction_op,           ///< [in] Binary reduction operator
-    T           prefix)                 ///< [in] Prefix to seed reduction with
-{
-    return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
-}
-
-
-/**
- * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array.  The aggregate is returned.
- *
- * \tparam LENGTH     LengthT of input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T*          input,                  ///< [in] Input array
-    ReductionOp reduction_op)           ///< [in] Binary reduction operator
-{
-    T prefix = input[0];
-    return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
-}
-
-
-/**
- * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    ReductionOp reduction_op,           ///< [in] Binary reduction operator
-    T           prefix)                 ///< [in] Prefix to seed reduction with
-{
-    return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
-}
-
-
-/**
- * \brief Serial reduction with the specified operator
- *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    ReductionOp reduction_op)           ///< [in] Binary reduction operator
-{
-    return ThreadReduce<LENGTH>((T*) input, reduction_op);
-}
-
-
-}               // internal namespace
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_scan.cuh b/external/cub/cub/thread/thread_scan.cuh
deleted file mode 100644
index 545b4141918..00000000000
--- a/external/cub/cub/thread/thread_scan.cuh
+++ /dev/null
@@ -1,268 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for sequential prefix scan over statically-sized array types
- */
-
-#pragma once
-
-#include "../thread/thread_operators.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
-namespace internal {
-
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-/**
- * \name Sequential prefix scan over statically-sized array types
- * @{
- */
-
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanExclusive(
-    T                   inclusive,
-    T                   exclusive,
-    T                   *input,                 ///< [in] Input array
-    T                   *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp              scan_op,                ///< [in] Binary scan operator
-    Int2Type<LENGTH>    /*length*/)
-{
-    #pragma unroll
-    for (int i = 0; i < LENGTH; ++i)
-    {
-        inclusive = scan_op(exclusive, input[i]);
-        output[i] = exclusive;
-        exclusive = inclusive;
-    }
-
-    return inclusive;
-}
-
-
-
-/**
- * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanExclusive(
-    T           *input,                 ///< [in] Input array
-    T           *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  If not, the first output element is undefined.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    T inclusive = input[0];
-    if (apply_prefix)
-    {
-        inclusive = scan_op(prefix, inclusive);
-    }
-    output[0] = prefix;
-    T exclusive = inclusive;
-
-    return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
-}
-
-
-/**
- * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanExclusive(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
-}
-
-
-
-
-
-
-
-
-
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T                   inclusive,
-    T                   *input,                 ///< [in] Input array
-    T                   *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp              scan_op,                ///< [in] Binary scan operator
-    Int2Type<LENGTH>    /*length*/)
-{
-    #pragma unroll
-    for (int i = 0; i < LENGTH; ++i)
-    {
-        inclusive = scan_op(inclusive, input[i]);
-        output[i] = inclusive;
-    }
-
-    return inclusive;
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array.  The aggregate is returned.
- *
- * \tparam LENGTH     LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           *input,                 ///< [in] Input array
-    T           *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op)                ///< [in] Binary scan operator
-{
-    T inclusive = input[0];
-    output[0] = inclusive;
-
-    // Continue scan
-    return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op)                ///< [in] Binary scan operator
-{
-    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op);
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           *input,                 ///< [in] Input array
-    T           *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    T inclusive = input[0];
-    if (apply_prefix)
-    {
-        inclusive = scan_op(prefix, inclusive);
-    }
-    output[0] = inclusive;
-
-    // Continue scan
-    return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
-}
-
-
-//@}  end member group
-
-/** @} */       // end group UtilModule
-
-
-}               // internal namespace
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_search.cuh b/external/cub/cub/thread/thread_search.cuh
deleted file mode 100644
index 379a08a51e7..00000000000
--- a/external/cub/cub/thread/thread_search.cuh
+++ /dev/null
@@ -1,154 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for sequential search
- */
-
-#pragma once
-
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * Computes the begin offsets into A and B for the specific diagonal
- */
-template <
-    typename AIteratorT,
-    typename BIteratorT,
-    typename OffsetT,
-    typename CoordinateT>
-__host__ __device__ __forceinline__ void MergePathSearch(
-    OffsetT         diagonal,
-    AIteratorT      a,
-    BIteratorT      b,
-    OffsetT         a_len,
-    OffsetT         b_len,
-    CoordinateT&    path_coordinate)
-{
-    /// The value type of the input iterator
-    typedef typename std::iterator_traits<AIteratorT>::value_type T;
-
-    OffsetT split_min = CUB_MAX(diagonal - b_len, 0);
-    OffsetT split_max = CUB_MIN(diagonal, a_len);
-
-    while (split_min < split_max)
-    {
-        OffsetT split_pivot = (split_min + split_max) >> 1;
-        if (a[split_pivot] <= b[diagonal - split_pivot - 1])
-        {
-            // Move candidate split range up A, down B
-            split_min = split_pivot + 1;
-        }
-        else
-        {
-            // Move candidate split range up B, down A
-            split_max = split_pivot;
-        }
-    }
-
-    path_coordinate.x = CUB_MIN(split_min, a_len);
-    path_coordinate.y = diagonal - split_min;
-}
-
-
-
-/**
- * \brief Returns the offset of the first value within \p input which does not compare less than \p val
- */
-template <
-    typename InputIteratorT,
-    typename OffsetT,
-    typename T>
-__device__ __forceinline__ OffsetT LowerBound(
-    InputIteratorT      input,              ///< [in] Input sequence
-    OffsetT             num_items,          ///< [in] Input sequence length
-    T                   val)                ///< [in] Search key
-{
-    OffsetT retval = 0;
-    while (num_items > 0)
-    {
-        OffsetT half = num_items >> 1;
-        if (input[retval + half] < val)
-        {
-            retval = retval + (half + 1);
-            num_items = num_items - (half + 1);
-        }
-        else
-        {
-            num_items = half;
-        }
-    }
-
-    return retval;
-}
-
-
-/**
- * \brief Returns the offset of the first value within \p input which compares greater than \p val
- */
-template <
-    typename InputIteratorT,
-    typename OffsetT,
-    typename T>
-__device__ __forceinline__ OffsetT UpperBound(
-    InputIteratorT      input,              ///< [in] Input sequence
-    OffsetT             num_items,          ///< [in] Input sequence length
-    T                   val)                ///< [in] Search key
-{
-    OffsetT retval = 0;
-    while (num_items > 0)
-    {
-        OffsetT half = num_items >> 1;
-        if (val < input[retval + half])
-        {
-            num_items = half;
-        }
-        else
-        {
-            retval = retval + (half + 1);
-            num_items = num_items - (half + 1);
-        }
-    }
-
-    return retval;
-}
-
-
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/thread/thread_store.cuh b/external/cub/cub/thread/thread_store.cuh
deleted file mode 100644
index 14ee84d9270..00000000000
--- a/external/cub/cub/thread/thread_store.cuh
+++ /dev/null
@@ -1,422 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for writing memory using PTX cache modifiers.
- */
-
-#pragma once
-
-#include <cuda.h>
-
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-
-//-----------------------------------------------------------------------------
-// Tags and constants
-//-----------------------------------------------------------------------------
-
-/**
- * \brief Enumeration of cache modifiers for memory store operations.
- */
-enum CacheStoreModifier
-{
-    STORE_DEFAULT,              ///< Default (no modifier)
-    STORE_WB,                   ///< Cache write-back all coherent levels
-    STORE_CG,                   ///< Cache at global level
-    STORE_CS,                   ///< Cache streaming (likely to be accessed once)
-    STORE_WT,                   ///< Cache write-through (to system memory)
-    STORE_VOLATILE,             ///< Volatile shared (any memory space)
-};
-
-
-/**
- * \name Thread I/O (cache modified)
- * @{
- */
-
-/**
- * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.  Can be used to store any data type.
- *
- * \par Example
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_store.cuh>
- *
- * // 32-bit store using cache-global modifier:
- * int *d_out;
- * int val;
- * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
- *
- * // 16-bit store using default modifier
- * short *d_out;
- * short val;
- * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
- *
- * // 256-bit store using write-through modifier
- * double4 *d_out;
- * double4 val;
- * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
- *
- * // 96-bit store using cache-streaming cache modifier
- * struct TestFoo { bool a; short b; };
- * TestFoo *d_struct;
- * TestFoo val;
- * cub::ThreadStore<cub::STORE_CS>(d_out + threadIdx.x, val);
- * \endcode
- *
- * \tparam MODIFIER             <b>[inferred]</b> CacheStoreModifier enumeration
- * \tparam InputIteratorT       <b>[inferred]</b> Output iterator type \iterator
- * \tparam T                    <b>[inferred]</b> Data type of output value
- */
-template <
-    CacheStoreModifier  MODIFIER,
-    typename            OutputIteratorT,
-    typename            T>
-__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
-
-
-//@}  end member group
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/// Helper structure for templated store iteration (inductive case)
-template <int COUNT, int MAX>
-struct IterateThreadStore
-{
-    template <CacheStoreModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Store(T *ptr, T *vals)
-    {
-        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
-        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
-    }
-
-    template <typename OutputIteratorT, typename T>
-    static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
-    {
-        ptr[COUNT] = vals[COUNT];
-        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
-    }
-
-};
-
-/// Helper structure for templated store iteration (termination case)
-template <int MAX>
-struct IterateThreadStore<MAX, MAX>
-{
-    template <CacheStoreModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}
-
-    template <typename OutputIteratorT, typename T>
-    static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}
-};
-
-
-/**
- * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier
- */
-#define _CUB_STORE_16(cub_modifier, ptx_modifier)                                            \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val)                         \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :               \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "r"(val.x),                                                                     \
-            "r"(val.y),                                                                     \
-            "r"(val.z),                                                                     \
-            "r"(val.w));                                                                    \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val)     \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :                       \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "l"(val.x),                                                                     \
-            "l"(val.y));                                                                    \
-    }
-
-
-/**
- * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier
- */
-#define _CUB_STORE_8(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val)                 \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :               \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "h"(val.x),                                                                     \
-            "h"(val.y),                                                                     \
-            "h"(val.z),                                                                     \
-            "h"(val.w));                                                                    \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val)                         \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :                       \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "r"(val.x),                                                                     \
-            "r"(val.y));                                                                    \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val)     \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :                                \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "l"(val));                                                                      \
-    }
-
-/**
- * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier
- */
-#define _CUB_STORE_4(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val)                             \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : :                                \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "r"(val));                                                                      \
-    }
-
-
-/**
- * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier
- */
-#define _CUB_STORE_2(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val)                     \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : :                                \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "h"(val));                                                                      \
-    }
-
-
-/**
- * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier
- */
-#define _CUB_STORE_1(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val)                         \
-    {                                                                                       \
-        asm volatile (                                                                      \
-        "{"                                                                                 \
-        "   .reg .u8 datum;"                                                                \
-        "   cvt.u8.u16 datum, %1;"                                                          \
-        "   st."#ptx_modifier".u8 [%0], datum;"                                             \
-        "}" : :                                                                             \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "h"((unsigned short) val));                                                               \
-    }
-
-/**
- * Define powers-of-two ThreadStore specializations for the given Cache load modifier
- */
-#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)                                           \
-    _CUB_STORE_16(cub_modifier, ptx_modifier)                                                \
-    _CUB_STORE_8(cub_modifier, ptx_modifier)                                                 \
-    _CUB_STORE_4(cub_modifier, ptx_modifier)                                                 \
-    _CUB_STORE_2(cub_modifier, ptx_modifier)                                                 \
-    _CUB_STORE_1(cub_modifier, ptx_modifier)                                                 \
-
-
-/**
- * Define ThreadStore specializations for the various Cache load modifiers
- */
-#if CUB_PTX_ARCH >= 200
-    _CUB_STORE_ALL(STORE_WB, wb)
-    _CUB_STORE_ALL(STORE_CG, cg)
-    _CUB_STORE_ALL(STORE_CS, cs)
-    _CUB_STORE_ALL(STORE_WT, wt)
-#else
-    _CUB_STORE_ALL(STORE_WB, global)
-    _CUB_STORE_ALL(STORE_CG, global)
-    _CUB_STORE_ALL(STORE_CS, global)
-    _CUB_STORE_ALL(STORE_WT, volatile.global)
-#endif
-
-
-// Macro cleanup
-#undef _CUB_STORE_ALL
-#undef _CUB_STORE_1
-#undef _CUB_STORE_2
-#undef _CUB_STORE_4
-#undef _CUB_STORE_8
-#undef _CUB_STORE_16
-
-
-/**
- * ThreadStore definition for STORE_DEFAULT modifier on iterator types
- */
-template <typename OutputIteratorT, typename T>
-__device__ __forceinline__ void ThreadStore(
-    OutputIteratorT             itr,
-    T                           val,
-    Int2Type<STORE_DEFAULT>     /*modifier*/,
-    Int2Type<false>             /*is_pointer*/)
-{
-    *itr = val;
-}
-
-
-/**
- * ThreadStore definition for STORE_DEFAULT modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStore(
-    T                           *ptr,
-    T                           val,
-    Int2Type<STORE_DEFAULT>     /*modifier*/,
-    Int2Type<true>              /*is_pointer*/)
-{
-    *ptr = val;
-}
-
-
-/**
- * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStoreVolatilePtr(
-    T                           *ptr,
-    T                           val,
-    Int2Type<true>              /*is_primitive*/)
-{
-    *reinterpret_cast<volatile T*>(ptr) = val;
-}
-
-
-/**
- * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStoreVolatilePtr(
-    T                           *ptr,
-    T                           val,
-    Int2Type<false>             /*is_primitive*/)
-{
-    // Create a temporary using shuffle-words, then store using volatile-words
-    typedef typename UnitWord<T>::VolatileWord  VolatileWord;  
-    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
-
-    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
-    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);
-    
-    VolatileWord words[VOLATILE_MULTIPLE];
-
-    #pragma unroll
-    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
-        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
-
-    IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
-        reinterpret_cast<volatile VolatileWord*>(ptr),
-        words);
-}
-
-
-/**
- * ThreadStore definition for STORE_VOLATILE modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStore(
-    T                           *ptr,
-    T                           val,
-    Int2Type<STORE_VOLATILE>    /*modifier*/,
-    Int2Type<true>              /*is_pointer*/)
-{
-    ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
-}
-
-
-/**
- * ThreadStore definition for generic modifiers on pointer types
- */
-template <typename T, int MODIFIER>
-__device__ __forceinline__ void ThreadStore(
-    T                           *ptr,
-    T                           val,
-    Int2Type<MODIFIER>          /*modifier*/,
-    Int2Type<true>              /*is_pointer*/)
-{
-    // Create a temporary using shuffle-words, then store using device-words
-    typedef typename UnitWord<T>::DeviceWord    DeviceWord;  
-    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
-
-    const int DEVICE_MULTIPLE   = sizeof(T) / sizeof(DeviceWord);
-    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);
-    
-    DeviceWord words[DEVICE_MULTIPLE];
-
-    #pragma unroll
-    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
-        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
-
-    IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
-        reinterpret_cast<DeviceWord*>(ptr),
-        words);
-}
-
-
-/**
- * ThreadStore definition for generic modifiers
- */
-template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
-__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
-{
-    ThreadStore(
-        itr,
-        val,
-        Int2Type<MODIFIER>(),
-        Int2Type<IsPointer<OutputIteratorT>::VALUE>());
-}
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/** @} */       // end group UtilIo
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_allocator.cuh b/external/cub/cub/util_allocator.cuh
deleted file mode 100644
index 24c7a79fee5..00000000000
--- a/external/cub/cub/util_allocator.cuh
+++ /dev/null
@@ -1,708 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple caching allocator for device memory allocations. The allocator is
- * thread-safe and capable of managing device allocations on multiple devices.
- ******************************************************************************/
-
-#pragma once
-
-#include "util_namespace.cuh"
-#include "util_debug.cuh"
-
-#include <set>
-#include <map>
-
-#include "host/mutex.cuh"
-#include <math.h>
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilMgmt
- * @{
- */
-
-
-/******************************************************************************
- * CachingDeviceAllocator (host use)
- ******************************************************************************/
-
-/**
- * \brief A simple caching allocator for device memory allocations.
- *
- * \par Overview
- * The allocator is thread-safe and stream-safe and is capable of managing cached
- * device allocations on multiple devices.  It behaves as follows:
- *
- * \par
- * - Allocations from the allocator are associated with an \p active_stream.  Once freed,
- *   the allocation becomes available immediately for reuse within the \p active_stream
- *   with which it was associated with during allocation, and it becomes available for
- *   reuse within other streams when all prior work submitted to \p active_stream has completed.
- * - Allocations are categorized and cached by bin size.  A new allocation request of
- *   a given size will only consider cached allocations within the corresponding bin.
- * - Bin limits progress geometrically in accordance with the growth factor
- *   \p bin_growth provided during construction.  Unused device allocations within
- *   a larger bin cache are not reused for allocation requests that categorize to
- *   smaller bin sizes.
- * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
- *   (\p bin_growth ^ \p min_bin).
- * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
- *   bin and are simply freed when they are deallocated instead of being returned
- *   to a bin-cache.
- * - %If the total storage of cached allocations on a given device will exceed
- *   \p max_cached_bytes, allocations for that device are simply freed when they are
- *   deallocated instead of being returned to their bin-cache.
- *
- * \par
- * For example, the default-constructed CachingDeviceAllocator is configured with:
- * - \p bin_growth          = 8
- * - \p min_bin             = 3
- * - \p max_bin             = 7
- * - \p max_cached_bytes    = 6MB - 1B
- *
- * \par
- * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
- * and sets a maximum of 6,291,455 cached bytes per device
- *
- */
-struct CachingDeviceAllocator
-{
-
-    //---------------------------------------------------------------------
-    // Constants
-    //---------------------------------------------------------------------
-
-    /// Out-of-bounds bin
-    static const unsigned int INVALID_BIN = (unsigned int) -1;
-
-    /// Invalid size
-    static const size_t INVALID_SIZE = (size_t) -1;
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    /// Invalid device ordinal
-    static const int INVALID_DEVICE_ORDINAL = -1;
-
-    //---------------------------------------------------------------------
-    // Type definitions and helper types
-    //---------------------------------------------------------------------
-
-    /**
-     * Descriptor for device memory allocations
-     */
-    struct BlockDescriptor
-    {
-        void*           d_ptr;              // Device pointer
-        size_t          bytes;              // Size of allocation in bytes
-        unsigned int    bin;                // Bin enumeration
-        int             device;             // device ordinal
-        cudaStream_t    associated_stream;  // Associated associated_stream
-        cudaEvent_t     ready_event;        // Signal when associated stream has run to the point at which this block was freed
-
-        // Constructor (suitable for searching maps for a specific block, given its pointer and device)
-        BlockDescriptor(void *d_ptr, int device) :
-            d_ptr(d_ptr),
-            bytes(0),
-            bin(INVALID_BIN),
-            device(device),
-            associated_stream(0),
-            ready_event(0)
-        {}
-
-        // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
-        BlockDescriptor(int device) :
-            d_ptr(NULL),
-            bytes(0),
-            bin(INVALID_BIN),
-            device(device),
-            associated_stream(0),
-            ready_event(0)
-        {}
-
-        // Comparison functor for comparing device pointers
-        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
-        {
-            if (a.device == b.device)
-                return (a.d_ptr < b.d_ptr);
-            else
-                return (a.device < b.device);
-        }
-
-        // Comparison functor for comparing allocation sizes
-        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
-        {
-            if (a.device == b.device)
-                return (a.bytes < b.bytes);
-            else
-                return (a.device < b.device);
-        }
-    };
-
-    /// BlockDescriptor comparator function interface
-    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
-
-    class TotalBytes {
-    public:
-        size_t free;
-        size_t live;
-        TotalBytes() { free = live = 0; }
-    };
-
-    /// Set type for cached blocks (ordered by size)
-    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
-
-    /// Set type for live blocks (ordered by ptr)
-    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
-
-    /// Map type of device ordinals to the number of cached bytes cached by each device
-    typedef std::map<int, TotalBytes> GpuCachedBytes;
-
-
-    //---------------------------------------------------------------------
-    // Utility functions
-    //---------------------------------------------------------------------
-
-    /**
-     * Integer pow function for unsigned base and exponent
-     */
-    static unsigned int IntPow(
-        unsigned int base,
-        unsigned int exp)
-    {
-        unsigned int retval = 1;
-        while (exp > 0)
-        {
-            if (exp & 1) {
-                retval = retval * base;        // multiply the result by the current base
-            }
-            base = base * base;                // square the base
-            exp = exp >> 1;                    // divide the exponent in half
-        }
-        return retval;
-    }
-
-
-    /**
-     * Round up to the nearest power-of
-     */
-    void NearestPowerOf(
-        unsigned int    &power,
-        size_t          &rounded_bytes,
-        unsigned int    base,
-        size_t          value)
-    {
-        power = 0;
-        rounded_bytes = 1;
-
-        if (value * base < value)
-        {
-            // Overflow
-            power = sizeof(size_t) * 8;
-            rounded_bytes = size_t(0) - 1;
-            return;
-        }
-
-        while (rounded_bytes < value)
-        {
-            rounded_bytes *= base;
-            power++;
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Fields
-    //---------------------------------------------------------------------
-
-    cub::Mutex      mutex;              /// Mutex for thread-safety
-
-    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
-    unsigned int    min_bin;            /// Minimum bin enumeration
-    unsigned int    max_bin;            /// Maximum bin enumeration
-
-    size_t          min_bin_bytes;      /// Minimum bin size
-    size_t          max_bin_bytes;      /// Maximum bin size
-    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device
-
-    const bool      skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
-    bool            debug;              /// Whether or not to print (de)allocation events to stdout
-
-    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
-    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
-    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-    //---------------------------------------------------------------------
-    // Methods
-    //---------------------------------------------------------------------
-
-    /**
-     * \brief Constructor.
-     */
-    CachingDeviceAllocator(
-        unsigned int    bin_growth,                             ///< Geometric growth factor for bin-sizes
-        unsigned int    min_bin             = 1,                ///< Minimum bin (default is bin_growth ^ 1)
-        unsigned int    max_bin             = INVALID_BIN,      ///< Maximum bin (default is no max bin)
-        size_t          max_cached_bytes    = INVALID_SIZE,     ///< Maximum aggregate cached bytes per device (default is no limit)
-        bool            skip_cleanup        = false,            ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
-        bool            debug               = false)            ///< Whether or not to print (de)allocation events to stdout (default is no stderr output)
-    :
-        bin_growth(bin_growth),
-        min_bin(min_bin),
-        max_bin(max_bin),
-        min_bin_bytes(IntPow(bin_growth, min_bin)),
-        max_bin_bytes(IntPow(bin_growth, max_bin)),
-        max_cached_bytes(max_cached_bytes),
-        skip_cleanup(skip_cleanup),
-        debug(debug),
-        cached_blocks(BlockDescriptor::SizeCompare),
-        live_blocks(BlockDescriptor::PtrCompare)
-    {}
-
-
-    /**
-     * \brief Default constructor.
-     *
-     * Configured with:
-     * \par
-     * - \p bin_growth          = 8
-     * - \p min_bin             = 3
-     * - \p max_bin             = 7
-     * - \p max_cached_bytes    = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
-     *
-     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
-     * sets a maximum of 6,291,455 cached bytes per device
-     */
-    CachingDeviceAllocator(
-        bool skip_cleanup = false,
-        bool debug = false)
-    :
-        bin_growth(8),
-        min_bin(3),
-        max_bin(7),
-        min_bin_bytes(IntPow(bin_growth, min_bin)),
-        max_bin_bytes(IntPow(bin_growth, max_bin)),
-        max_cached_bytes((max_bin_bytes * 3) - 1),
-        skip_cleanup(skip_cleanup),
-        debug(debug),
-        cached_blocks(BlockDescriptor::SizeCompare),
-        live_blocks(BlockDescriptor::PtrCompare)
-    {}
-
-
-    /**
-     * \brief Sets the limit on the number bytes this allocator is allowed to cache per device.
-     *
-     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
-     * cached-in-reserve) to be freed.  See \p FreeAllCached().
-     */
-    cudaError_t SetMaxCachedBytes(
-        size_t max_cached_bytes)
-    {
-        // Lock
-        mutex.Lock();
-
-        if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);
-
-        this->max_cached_bytes = max_cached_bytes;
-
-        // Unlock
-        mutex.Unlock();
-
-        return cudaSuccess;
-    }
-
-
-    /**
-     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceAllocate(
-        int             device,             ///< [in] Device on which to place the allocation
-        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
-        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
-        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
-    {
-        *d_ptr                          = NULL;
-        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
-        cudaError_t error               = cudaSuccess;
-
-        if (device == INVALID_DEVICE_ORDINAL)
-        {
-            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
-            device = entrypoint_device;
-        }
-
-        // Create a block descriptor for the requested allocation
-        bool found = false;
-        BlockDescriptor search_key(device);
-        search_key.associated_stream = active_stream;
-        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
-
-        if (search_key.bin > max_bin)
-        {
-            // Bin is greater than our maximum bin: allocate the request
-            // exactly and give out-of-bounds bin.  It will not be cached
-            // for reuse when returned.
-            search_key.bin      = INVALID_BIN;
-            search_key.bytes    = bytes;
-        }
-        else
-        {
-            // Search for a suitable cached allocation: lock
-            mutex.Lock();
-
-            if (search_key.bin < min_bin)
-            {
-                // Bin is less than minimum bin: round up
-                search_key.bin      = min_bin;
-                search_key.bytes    = min_bin_bytes;
-            }
-
-            // Iterate through the range of cached blocks on the same device in the same bin
-            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
-            while ((block_itr != cached_blocks.end())
-                    && (block_itr->device == device)
-                    && (block_itr->bin == search_key.bin))
-            {
-                // To prevent races with reusing blocks returned by the host but still
-                // in use by the device, only consider cached blocks that are
-                // either (from the active stream) or (from an idle stream)
-                if ((active_stream == block_itr->associated_stream) ||
-                    (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
-                {
-                    // Reuse existing cache block.  Insert into live blocks.
-                    found = true;
-                    search_key = *block_itr;
-                    search_key.associated_stream = active_stream;
-                    live_blocks.insert(search_key);
-
-                    // Remove from free blocks
-                    cached_bytes[device].free -= search_key.bytes;
-                    cached_bytes[device].live += search_key.bytes;
-
-                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
-                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long)  block_itr->associated_stream);
-
-                    cached_blocks.erase(block_itr);
-
-                    break;
-                }
-                block_itr++;
-            }
-
-            // Done searching: unlock
-            mutex.Unlock();
-        }
-
-        // Allocate the block if necessary
-        if (!found)
-        {
-            // Set runtime's current device to specified device (entrypoint may not be set)
-            if (device != entrypoint_device)
-            {
-                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
-                if (CubDebug(error = cudaSetDevice(device))) return error;
-            }
-
-            // Attempt to allocate
-            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
-            {
-                // The allocation attempt failed: free all cached blocks on device and retry
-                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
-                      device, (long long) search_key.bytes, (long long) search_key.associated_stream);
-
-                error = cudaSuccess;    // Reset the error we will return
-                cudaGetLastError();     // Reset CUDART's error
-
-                // Lock
-                mutex.Lock();
-
-                // Iterate the range of free blocks on the same device
-                BlockDescriptor free_key(device);
-                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);
-
-                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
-                {
-                    // No need to worry about synchronization with the device: cudaFree is
-                    // blocking and will synchronize across all kernels executing
-                    // on the current device
-
-                    // Free device memory and destroy stream event.
-                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
-                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;
-
-                    // Reduce balance and erase entry
-                    cached_bytes[device].free -= block_itr->bytes;
-
-                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                        device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
-
-                    cached_blocks.erase(block_itr);
-
-                    block_itr++;
-                }
-
-                // Unlock
-                mutex.Unlock();
-
-                // Return under error
-                if (error) return error;
-
-                // Try to allocate again
-                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
-            }
-
-            // Create ready event
-            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
-                return error;
-
-            // Insert into live blocks
-            mutex.Lock();
-            live_blocks.insert(search_key);
-            cached_bytes[device].live += search_key.bytes;
-            mutex.Unlock();
-
-            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
-                      device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
-
-            // Attempt to revert back to previous device if necessary
-            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
-            {
-                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
-            }
-        }
-
-        // Copy device pointer to output parameter
-        *d_ptr = search_key.d_ptr;
-
-        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
-            (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
-
-        return error;
-    }
-
-
-    /**
-     * \brief Provides a suitable allocation of device memory for the given size on the current device.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceAllocate(
-        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
-        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
-        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
-    {
-        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
-    }
-
-
-    /**
-     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceFree(
-        int             device,
-        void*           d_ptr)
-    {
-        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
-        cudaError_t error               = cudaSuccess;
-
-        if (device == INVALID_DEVICE_ORDINAL)
-        {
-            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-                return error;
-            device = entrypoint_device;
-        }
-
-        // Lock
-        mutex.Lock();
-
-        // Find corresponding block descriptor
-        bool recached = false;
-        BlockDescriptor search_key(d_ptr, device);
-        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
-        if (block_itr != live_blocks.end())
-        {
-            // Remove from live blocks
-            search_key = *block_itr;
-            live_blocks.erase(block_itr);
-            cached_bytes[device].live -= search_key.bytes;
-
-            // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
-            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
-            {
-                // Insert returned allocation into free blocks
-                recached = true;
-                cached_blocks.insert(search_key);
-                cached_bytes[device].free += search_key.bytes;
-
-                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
-                    (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
-            }
-        }
-
-        // Unlock
-        mutex.Unlock();
-
-        // First set to specified device (entrypoint may not be set)
-        if (device != entrypoint_device)
-        {
-            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
-            if (CubDebug(error = cudaSetDevice(device))) return error;
-        }
-
-        if (recached)
-        {
-            // Insert the ready event in the associated stream (must have current device set properly)
-            if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
-        }
-        else
-        {
-            // Free the allocation from the runtime and cleanup the event.
-            if (CubDebug(error = cudaFree(d_ptr))) return error;
-            if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
-
-            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
-        }
-
-        // Reset device
-        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
-        {
-            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
-        }
-
-        return error;
-    }
-
-
-    /**
-     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceFree(
-        void*           d_ptr)
-    {
-        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
-    }
-
-
-    /**
-     * \brief Frees all cached device allocations on all devices
-     */
-    cudaError_t FreeAllCached()
-    {
-        cudaError_t error         = cudaSuccess;
-        int entrypoint_device     = INVALID_DEVICE_ORDINAL;
-        int current_device        = INVALID_DEVICE_ORDINAL;
-
-        mutex.Lock();
-
-        while (!cached_blocks.empty())
-        {
-            // Get first block
-            CachedBlocks::iterator begin = cached_blocks.begin();
-
-            // Get entry-point device ordinal if necessary
-            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
-            {
-                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
-            }
-
-            // Set current device ordinal if necessary
-            if (begin->device != current_device)
-            {
-                if (CubDebug(error = cudaSetDevice(begin->device))) break;
-                current_device = begin->device;
-            }
-
-            // Free device memory
-            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
-            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;
-
-            // Reduce balance and erase entry
-            cached_bytes[current_device].free -= begin->bytes;
-
-            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);
-
-            cached_blocks.erase(begin);
-        }
-
-        mutex.Unlock();
-
-        // Attempt to revert back to entry-point device if necessary
-        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
-        {
-            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
-        }
-
-        return error;
-    }
-
-
-    /**
-     * \brief Destructor
-     */
-    virtual ~CachingDeviceAllocator()
-    {
-        if (!skip_cleanup)
-            FreeAllCached();
-    }
-
-};
-
-
-
-
-/** @} */       // end group UtilMgmt
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_arch.cuh b/external/cub/cub/util_arch.cuh
deleted file mode 100644
index 5ec36e5f1f7..00000000000
--- a/external/cub/cub/util_arch.cuh
+++ /dev/null
@@ -1,151 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Static architectural properties by SM version.
- */
-
-#pragma once
-
-#include "util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS)
-    #define CUB_USE_COOPERATIVE_GROUPS
-#endif
-
-/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
-#ifndef CUB_PTX_ARCH
-    #ifndef __CUDA_ARCH__
-        #define CUB_PTX_ARCH 0
-    #else
-        #define CUB_PTX_ARCH __CUDA_ARCH__
-    #endif
-#endif
-
-
-/// Whether or not the source targeted by the active compiler pass is allowed to  invoke device kernels or methods from the CUDA runtime API.
-#ifndef CUB_RUNTIME_FUNCTION
-    #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
-        #define CUB_RUNTIME_ENABLED
-        #define CUB_RUNTIME_FUNCTION __host__ __device__
-    #else
-        #define CUB_RUNTIME_FUNCTION __host__
-    #endif
-#endif
-
-
-/// Number of threads per warp
-#ifndef CUB_LOG_WARP_THREADS
-    #define CUB_LOG_WARP_THREADS(arch)                      \
-        (5)
-    #define CUB_WARP_THREADS(arch)                          \
-        (1 << CUB_LOG_WARP_THREADS(arch))
-
-    #define CUB_PTX_WARP_THREADS        CUB_WARP_THREADS(CUB_PTX_ARCH)
-    #define CUB_PTX_LOG_WARP_THREADS    CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
-#endif
-
-
-/// Number of smem banks
-#ifndef CUB_LOG_SMEM_BANKS
-    #define CUB_LOG_SMEM_BANKS(arch)                        \
-        ((arch >= 200) ?                                    \
-            (5) :                                           \
-            (4))
-    #define CUB_SMEM_BANKS(arch)                            \
-        (1 << CUB_LOG_SMEM_BANKS(arch))
-
-    #define CUB_PTX_LOG_SMEM_BANKS      CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
-    #define CUB_PTX_SMEM_BANKS          CUB_SMEM_BANKS(CUB_PTX_ARCH)
-#endif
-
-
-/// Oversubscription factor
-#ifndef CUB_SUBSCRIPTION_FACTOR
-    #define CUB_SUBSCRIPTION_FACTOR(arch)                   \
-        ((arch >= 300) ?                                    \
-            (5) :                                           \
-            ((arch >= 200) ?                                \
-                (3) :                                       \
-                (10)))
-    #define CUB_PTX_SUBSCRIPTION_FACTOR             CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH)
-#endif
-
-
-/// Prefer padding overhead vs X-way conflicts greater than this threshold
-#ifndef CUB_PREFER_CONFLICT_OVER_PADDING
-    #define CUB_PREFER_CONFLICT_OVER_PADDING(arch)          \
-        ((arch >= 300) ?                                    \
-            (1) :                                           \
-            (4))
-    #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING    CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
-#endif
-
-
-/// Scale down the number of warps to keep same amount of "tile" storage as the nominal configuration for 4B data.  Minimum of two warps.
-#ifndef CUB_BLOCK_THREADS
-    #define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)                        \
-        (CUB_MIN(                                                                           \
-            NOMINAL_4B_BLOCK_THREADS * 2,                                                   \
-            CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX(                                           \
-                (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 3 / 4,            \
-                (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T))))
-#endif
-
-/// Scale up/down number of items per thread to keep the same amount of "tile" storage as the nominal configuration for 4B data.  Minimum 1 item per thread
-#ifndef CUB_ITEMS_PER_THREAD
-    #define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)    \
-	    (CUB_MIN(                                                                                       \
-	        NOMINAL_4B_ITEMS_PER_THREAD * 2,                                                            \
-	        CUB_MAX(                                                                                    \
-	            1,                                                                                      \
-	            (NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH))))
-#endif
-
-/// Define both nominal threads-per-block and items-per-thread
-#ifndef CUB_NOMINAL_CONFIG
-    #define CUB_NOMINAL_CONFIG(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)    \
-        CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200),                                \
-        CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200)
-#endif
-
-
-
-#endif  // Do not document
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_debug.cuh b/external/cub/cub/util_debug.cuh
deleted file mode 100644
index 1ad60cf2db6..00000000000
--- a/external/cub/cub/util_debug.cuh
+++ /dev/null
@@ -1,145 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Error and event logging routines.
- *
- * The following macros definitions are supported:
- * - \p CUB_LOG.  Simple event messages are printed to \p stdout.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include "util_namespace.cuh"
-#include "util_arch.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilMgmt
- * @{
- */
-
-
-/// CUB error reporting macro (prints error messages to stderr)
-#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR)
-    #define CUB_STDERR
-#endif
-
-
-
-/**
- * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context.
- *
- * \return The CUDA error.
- */
-__host__ __device__ __forceinline__ cudaError_t Debug(
-    cudaError_t     error,
-    const char*     filename,
-    int             line)
-{
-    (void)filename;
-    (void)line;
-#ifdef CUB_STDERR
-    if (error)
-    {
-    #if (CUB_PTX_ARCH == 0)
-        fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
-        fflush(stderr);
-    #elif (CUB_PTX_ARCH >= 200)
-        printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
-    #endif
-    }
-#endif
-    return error;
-}
-
-
-/**
- * \brief Debug macro
- */
-#ifndef CubDebug
-    #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__)
-#endif
-
-
-/**
- * \brief Debug macro with exit
- */
-#ifndef CubDebugExit
-    #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); }
-#endif
-
-
-/**
- * \brief Log macro for printf statements.
- */
-#if !defined(_CubLog)
-    #if !(defined(__clang__) && defined(__CUDA__))
-        #if (CUB_PTX_ARCH == 0)
-            #define _CubLog(format, ...) printf(format,__VA_ARGS__);
-        #elif (CUB_PTX_ARCH >= 200)
-            #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__);
-        #endif
-    #else
-        // XXX shameless hack for clang around variadic printf...
-        //     Compilies w/o supplying -std=c++11 but shows warning,
-        //     so we sielence them :)
-        #pragma clang diagnostic ignored "-Wc++11-extensions"
-        #pragma clang diagnostic ignored "-Wunnamed-type-template-args"
-            template <class... Args>
-            inline __host__ __device__ void va_printf(char const* format, Args const&... args)
-            {
-        #ifdef __CUDA_ARCH__
-              printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...);
-        #else
-              printf(format, args...);
-        #endif
-            }
-        #ifndef __CUDA_ARCH__
-            #define _CubLog(format, ...) va_printf(format,__VA_ARGS__);
-        #else
-            #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__);
-        #endif
-    #endif
-#endif
-
-
-
-
-/** @} */       // end group UtilMgmt
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_device.cuh b/external/cub/cub/util_device.cuh
deleted file mode 100644
index fa73dbd74f1..00000000000
--- a/external/cub/cub/util_device.cuh
+++ /dev/null
@@ -1,347 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Properties of a given CUDA device and the corresponding PTX bundle
- */
-
-#pragma once
-
-#include "util_type.cuh"
-#include "util_arch.cuh"
-#include "util_debug.cuh"
-#include "util_namespace.cuh"
-#include "util_macro.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilMgmt
- * @{
- */
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/**
- * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
- */
-template <int ALLOCATIONS>
-__host__ __device__ __forceinline__
-cudaError_t AliasTemporaries(
-    void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \t d_temp_storage allocation
-    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
-    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
-{
-    const int ALIGN_BYTES   = 256;
-    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);
-
-    // Compute exclusive prefix sum over allocation requests
-    size_t allocation_offsets[ALLOCATIONS];
-    size_t bytes_needed = 0;
-    for (int i = 0; i < ALLOCATIONS; ++i)
-    {
-        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
-        allocation_offsets[i] = bytes_needed;
-        bytes_needed += allocation_bytes;
-    }
-    bytes_needed += ALIGN_BYTES - 1;
-
-    // Check if the caller is simply requesting the size of the storage allocation
-    if (!d_temp_storage)
-    {
-        temp_storage_bytes = bytes_needed;
-        return cudaSuccess;
-    }
-
-    // Check if enough storage provided
-    if (temp_storage_bytes < bytes_needed)
-    {
-        return CubDebug(cudaErrorInvalidValue);
-    }
-
-    // Alias
-    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
-    for (int i = 0; i < ALLOCATIONS; ++i)
-    {
-        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
-    }
-
-    return cudaSuccess;
-}
-
-
-/**
- * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
- */
-template <typename T>
-__global__ void EmptyKernel(void) { }
-
-
-#endif  // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
- */
-CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
-{
-    struct Dummy
-    {
-        /// Type definition of the EmptyKernel kernel entry point
-        typedef void (*EmptyKernelPtr)();
-
-        /// Force EmptyKernel<void> to be generated if this class is used
-        CUB_RUNTIME_FUNCTION __forceinline__
-        EmptyKernelPtr Empty()
-        {
-            return EmptyKernel<void>;
-        }
-    };
-
-
-#ifndef CUB_RUNTIME_ENABLED
-    (void)ptx_version;
-
-    // CUDA API calls not supported from this device
-    return cudaErrorInvalidConfiguration;
-
-#elif (CUB_PTX_ARCH > 0)
-
-    ptx_version = CUB_PTX_ARCH;
-    return cudaSuccess;
-
-#else
-
-    cudaError_t error = cudaSuccess;
-    do
-    {
-        cudaFuncAttributes empty_kernel_attrs;
-        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
-        ptx_version = empty_kernel_attrs.ptxVersion * 10;
-    }
-    while (0);
-
-    return error;
-
-#endif
-}
-
-
-/**
- * \brief Retrieves the SM version (major * 100 + minor * 10)
- */
-CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
-{
-#ifndef CUB_RUNTIME_ENABLED
-    (void)sm_version;
-    (void)device_ordinal;
-
-    // CUDA API calls not supported from this device
-    return cudaErrorInvalidConfiguration;
-
-#else
-
-    cudaError_t error = cudaSuccess;
-    do
-    {
-        // Fill in SM version
-        int major, minor;
-        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
-        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
-        sm_version = major * 100 + minor * 10;
-    }
-    while (0);
-
-    return error;
-
-#endif
-}
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Synchronize the stream if specified
- */
-CUB_RUNTIME_FUNCTION __forceinline__
-static cudaError_t SyncStream(cudaStream_t stream)
-{
-#if (CUB_PTX_ARCH == 0)
-    return cudaStreamSynchronize(stream);
-#else
-    (void)stream;
-    // Device can't yet sync on a specific stream
-    return cudaDeviceSynchronize();
-#endif
-}
-
-
-/**
- * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
- *
- * \par Snippet
- * The code snippet below illustrates the use of the MaxSmOccupancy function.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_device.cuh>
- *
- * template <typename T>
- * __global__ void ExampleKernel()
- * {
- *     // Allocate shared memory for BlockScan
- *     __shared__ volatile T buffer[4096];
- *
- *        ...
- * }
- *
- *     ...
- *
- * // Determine SM occupancy for ExampleKernel specialized for unsigned char
- * int max_sm_occupancy;
- * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
- *
- * // max_sm_occupancy  <-- 4 on SM10
- * // max_sm_occupancy  <-- 8 on SM20
- * // max_sm_occupancy  <-- 12 on SM35
- *
- * \endcode
- *
- */
-template <typename KernelPtr>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t MaxSmOccupancy(
-    int                 &max_sm_occupancy,          ///< [out] maximum number of thread blocks that can reside on a single SM
-    KernelPtr           kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy
-    int                 block_threads,              ///< [in] Number of threads per thread block
-    int                 dynamic_smem_bytes = 0)
-{
-#ifndef CUB_RUNTIME_ENABLED
-    (void)dynamic_smem_bytes;
-    (void)block_threads;
-    (void)kernel_ptr;
-    (void)max_sm_occupancy;
-
-    // CUDA API calls not supported from this device
-    return CubDebug(cudaErrorInvalidConfiguration);
-
-#else
-
-    return cudaOccupancyMaxActiveBlocksPerMultiprocessor (
-        &max_sm_occupancy,
-        kernel_ptr,
-        block_threads,
-        dynamic_smem_bytes);
-
-#endif  // CUB_RUNTIME_ENABLED
-}
-
-
-/******************************************************************************
- * Policy management
- ******************************************************************************/
-
-/**
- * Kernel dispatch configuration
- */
-struct KernelConfig
-{
-    int block_threads;
-    int items_per_thread;
-    int tile_size;
-    int sm_occupancy;
-
-    CUB_RUNTIME_FUNCTION __forceinline__
-    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}
-
-    template <typename AgentPolicyT, typename KernelPtrT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t Init(KernelPtrT kernel_ptr)
-    {
-        block_threads        = AgentPolicyT::BLOCK_THREADS;
-        items_per_thread     = AgentPolicyT::ITEMS_PER_THREAD;
-        tile_size            = block_threads * items_per_thread;
-        cudaError_t retval   = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
-        return retval;
-    }
-};
-
-
-
-/// Helper for dispatching into a policy chain
-template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
-struct ChainedPolicy
-{
-   /// The policy for the active compiler pass
-   typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;
-
-   /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
-   template <typename FunctorT>
-   CUB_RUNTIME_FUNCTION __forceinline__
-   static cudaError_t Invoke(int ptx_version, FunctorT &op)
-   {
-       if (ptx_version < PTX_VERSION) {
-           return PrevPolicyT::Invoke(ptx_version, op);
-       }
-       return op.template Invoke<PolicyT>();
-   }
-};
-
-/// Helper for dispatching into a policy chain (end-of-chain specialization)
-template <int PTX_VERSION, typename PolicyT>
-struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
-{
-    /// The policy for the active compiler pass
-    typedef PolicyT ActivePolicy;
-
-    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
-    template <typename FunctorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) {
-        return op.template Invoke<PolicyT>();
-    }
-};
-
-
-
-
-#endif  // Do not document
-
-
-
-
-/** @} */       // end group UtilMgmt
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_macro.cuh b/external/cub/cub/util_macro.cuh
deleted file mode 100644
index 73c29d22c5c..00000000000
--- a/external/cub/cub/util_macro.cuh
+++ /dev/null
@@ -1,103 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Common C/C++ macro utilities
- ******************************************************************************/
-
-#pragma once
-
-#include "util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-#ifndef CUB_ALIGN
-    #if defined(_WIN32) || defined(_WIN64)
-        /// Align struct
-        #define CUB_ALIGN(bytes) __declspec(align(32))
-    #else
-        /// Align struct
-        #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
-    #endif
-#endif
-
-#ifndef CUB_MAX
-    /// Select maximum(a, b)
-    #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
-#endif
-
-#ifndef CUB_MIN
-    /// Select minimum(a, b)
-    #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
-#endif
-
-#ifndef CUB_QUOTIENT_FLOOR
-    /// Quotient of x/y rounded down to nearest integer
-    #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
-#endif
-
-#ifndef CUB_QUOTIENT_CEILING
-    /// Quotient of x/y rounded up to nearest integer
-    #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
-#endif
-
-#ifndef CUB_ROUND_UP_NEAREST
-    /// x rounded up to the nearest multiple of y
-    #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y)
-#endif
-
-#ifndef CUB_ROUND_DOWN_NEAREST
-    /// x rounded down to the nearest multiple of y
-    #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y)
-#endif
-
-
-#ifndef CUB_STATIC_ASSERT
-    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-        #define CUB_CAT_(a, b) a ## b
-        #define CUB_CAT(a, b) CUB_CAT_(a, b)
-    #endif // DOXYGEN_SHOULD_SKIP_THIS
-
-    /// Static assert
-    #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
-#endif
-
-/** @} */       // end group UtilModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_namespace.cuh b/external/cub/cub/util_namespace.cuh
deleted file mode 100644
index edb61260669..00000000000
--- a/external/cub/cub/util_namespace.cuh
+++ /dev/null
@@ -1,46 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Place-holder for prefixing the cub namespace
- */
-
-#pragma once
-
-// For example:
-//#define CUB_NS_PREFIX namespace thrust{ namespace detail {
-//#define CUB_NS_POSTFIX } }
-
-#ifndef CUB_NS_PREFIX
-#define CUB_NS_PREFIX
-#endif
-
-#ifndef CUB_NS_POSTFIX
-#define CUB_NS_POSTFIX
-#endif
diff --git a/external/cub/cub/util_ptx.cuh b/external/cub/cub/util_ptx.cuh
deleted file mode 100644
index fae6e4fae2e..00000000000
--- a/external/cub/cub/util_ptx.cuh
+++ /dev/null
@@ -1,729 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * PTX intrinsics
- */
-
-
-#pragma once
-
-#include "util_type.cuh"
-#include "util_arch.cuh"
-#include "util_namespace.cuh"
-#include "util_debug.cuh"
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilPtx
- * @{
- */
-
-
-/******************************************************************************
- * PTX helper macros
- ******************************************************************************/
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Register modifier for pointer-types (for inlining PTX assembly)
- */
-#if defined(_WIN64) || defined(__LP64__)
-    #define __CUB_LP64__ 1
-    // 64-bit register modifier for inlined asm
-    #define _CUB_ASM_PTR_ "l"
-    #define _CUB_ASM_PTR_SIZE_ "u64"
-#else
-    #define __CUB_LP64__ 0
-    // 32-bit register modifier for inlined asm
-    #define _CUB_ASM_PTR_ "r"
-    #define _CUB_ASM_PTR_SIZE_ "u32"
-#endif
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/******************************************************************************
- * Inlined PTX intrinsics
- ******************************************************************************/
-
-/**
- * \brief Shift-right then add.  Returns (\p x >> \p shift) + \p addend.
- */
-__device__ __forceinline__ unsigned int SHR_ADD(
-    unsigned int x,
-    unsigned int shift,
-    unsigned int addend)
-{
-    unsigned int ret;
-#if CUB_PTX_ARCH >= 200
-    asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
-        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-#else
-    ret = (x >> shift) + addend;
-#endif
-    return ret;
-}
-
-
-/**
- * \brief Shift-left then add.  Returns (\p x << \p shift) + \p addend.
- */
-__device__ __forceinline__ unsigned int SHL_ADD(
-    unsigned int x,
-    unsigned int shift,
-    unsigned int addend)
-{
-    unsigned int ret;
-#if CUB_PTX_ARCH >= 200
-    asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
-        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-#else
-    ret = (x << shift) + addend;
-#endif
-    return ret;
-}
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Bitfield-extract.
- */
-template <typename UnsignedBits, int BYTE_LEN>
-__device__ __forceinline__ unsigned int BFE(
-    UnsignedBits            source,
-    unsigned int            bit_start,
-    unsigned int            num_bits,
-    Int2Type<BYTE_LEN>      /*byte_len*/)
-{
-    unsigned int bits;
-#if CUB_PTX_ARCH >= 200
-    asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
-#else
-    const unsigned int MASK = (1 << num_bits) - 1;
-    bits = (source >> bit_start) & MASK;
-#endif
-    return bits;
-}
-
-
-/**
- * Bitfield-extract for 64-bit types.
- */
-template <typename UnsignedBits>
-__device__ __forceinline__ unsigned int BFE(
-    UnsignedBits            source,
-    unsigned int            bit_start,
-    unsigned int            num_bits,
-    Int2Type<8>             /*byte_len*/)
-{
-    const unsigned long long MASK = (1ull << num_bits) - 1;
-    return (source >> bit_start) & MASK;
-}
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * \brief Bitfield-extract.  Extracts \p num_bits from \p source starting at bit-offset \p bit_start.  The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type.
- */
-template <typename UnsignedBits>
-__device__ __forceinline__ unsigned int BFE(
-    UnsignedBits source,
-    unsigned int bit_start,
-    unsigned int num_bits)
-{
-    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
-}
-
-
-/**
- * \brief Bitfield insert.  Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start.
- */
-__device__ __forceinline__ void BFI(
-    unsigned int &ret,
-    unsigned int x,
-    unsigned int y,
-    unsigned int bit_start,
-    unsigned int num_bits)
-{
-#if CUB_PTX_ARCH >= 200
-    asm ("bfi.b32 %0, %1, %2, %3, %4;" :
-        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
-#else
-    x <<= bit_start;
-    unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
-    unsigned int MASK_Y = ~MASK_X;
-    ret = (y & MASK_Y) | (x & MASK_X);
-#endif
-}
-
-
-/**
- * \brief Three-operand add.  Returns \p x + \p y + \p z.
- */
-__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
-{
-#if CUB_PTX_ARCH >= 200
-    asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
-#else
-    x = x + y + z;
-#endif
-    return x;
-}
-
-
-/**
- * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register.  For SM2.0 or later.
- *
- * \par
- * The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
- * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
- * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
- * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
- *
- * \par Snippet
- * The code snippet below illustrates byte-permute.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     int a        = 0x03020100;
- *     int b        = 0x07060504;
- *     int index    = 0x00007531;
- *
- *     int selected = PRMT(a, b, index);    // 0x07050301
- *
- * \endcode
- *
- */
-__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
-{
-    int ret;
-    asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
-    return ret;
-}
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Sync-threads barrier.
- */
-__device__ __forceinline__ void BAR(int count)
-{
-    asm volatile("bar.sync 1, %0;" : : "r"(count));
-}
-
-/**
- * CTA barrier
- */
-__device__  __forceinline__ void CTA_SYNC()
-{
-    __syncthreads();
-}
-
-
-/**
- * CTA barrier with predicate
- */
-__device__  __forceinline__ int CTA_SYNC_AND(int p)
-{
-    return __syncthreads_and(p);
-}
-
-
-/**
- * Warp barrier
- */
-__device__  __forceinline__ void WARP_SYNC(unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    __syncwarp(member_mask);
-#endif
-}
-
-
-/**
- * Warp any
- */
-__device__  __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    return __any_sync(member_mask, predicate);
-#else
-    return ::__any(predicate);
-#endif
-}
-
-
-/**
- * Warp any
- */
-__device__  __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    return __all_sync(member_mask, predicate);
-#else
-    return ::__all(predicate);
-#endif
-}
-
-
-/**
- * Warp ballot
- */
-__device__  __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    return __ballot_sync(member_mask, predicate);
-#else
-    return __ballot(predicate);
-#endif
-}
-
-/**
- * Warp synchronous shfl_up
- */
-__device__ __forceinline__ 
-unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int first_lane, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
-        : "=r"(word) : "r"(word), "r"(src_offset), "r"(first_lane), "r"(member_mask));
-#else
-    asm volatile("shfl.up.b32 %0, %1, %2, %3;"
-        : "=r"(word) : "r"(word), "r"(src_offset), "r"(first_lane));
-#endif
-    return word;
-}
-
-/**
- * Warp synchronous shfl_down
- */
-__device__ __forceinline__ 
-unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int last_lane, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
-        : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane), "r"(member_mask));
-#else
-    asm volatile("shfl.down.b32 %0, %1, %2, %3;"
-        : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane));
-#endif
-    return word;
-}
-
-/**
- * Warp synchronous shfl_idx
- */
-__device__ __forceinline__ 
-unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int last_lane, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
-        : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane), "r"(member_mask));
-#else
-    asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
-        : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane));
-#endif
-    return word;
-}
-
-/**
- * Floating point multiply. (Mantissa LSB rounds towards zero.)
- */
-__device__ __forceinline__ float FMUL_RZ(float a, float b)
-{
-    float d;
-    asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
-    return d;
-}
-
-
-/**
- * Floating point multiply-add. (Mantissa LSB rounds towards zero.)
- */
-__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
-{
-    float d;
-    asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
-    return d;
-}
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * \brief Terminates the calling thread
- */
-__device__ __forceinline__ void ThreadExit() {
-    asm volatile("exit;");
-}    
-
-
-/**
- * \brief  Abort execution and generate an interrupt to the host CPU
- */
-__device__ __forceinline__ void ThreadTrap() {
-    asm volatile("trap;");
-}
-
-
-/**
- * \brief Returns the row-major linear thread identifier for a multidimensional thread block
- */
-__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
-{
-    return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
-            ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
-            threadIdx.x;
-}
-
-
-/**
- * \brief Returns the warp lane ID of the calling thread
- */
-__device__ __forceinline__ unsigned int LaneId()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
-    return ret;
-}
-
-
-/**
- * \brief Returns the warp ID of the calling thread.  Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
- */
-__device__ __forceinline__ unsigned int WarpId()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes less than the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskLt()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskLe()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes greater than the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskGt()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskGe()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
-    return ret;
-}
-
-/** @} */       // end group UtilPtx
-
-
-
-
-/**
- * \brief Shuffle-up for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>-<tt>src_offset</tt></sub>.  For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png)
- * \ingroup WarpModule
- *
- * \par
- * - Available only for SM3.0 or newer
- *
- * \par Snippet
- * The code snippet below illustrates each thread obtaining a \p double value from the
- * predecessor of its predecessor.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Obtain one input item per thread
- *     double thread_data = ...
- *
- *     // Obtain item from two ranks below
- *     double peer_data = ShuffleUp(thread_data, 2, 0, 0xffffffff);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
- * The corresponding output \p peer_data will be <tt>{1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}</tt>.
- *
- */
-template <typename T>
-__device__ __forceinline__ T ShuffleUp(
-    T               input,              ///< [in] The value to broadcast
-    int             src_offset,         ///< [in] The relative down-offset of the peer to read from
-    int             first_lane,         ///< [in] Index of first lane in segment (typically 0)
-    unsigned int    member_mask)        ///< [in] 32-bit mask of participating warp lanes
-{
-    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
-
-    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
- 
-    T               output;
-    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
-    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
-
-    unsigned int shuffle_word;
-    shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_lane, member_mask);
-    output_alias[0] = shuffle_word;
-
-    #pragma unroll
-    for (int WORD = 1; WORD < WORDS; ++WORD)
-    {
-        shuffle_word       = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_lane, member_mask);
-        output_alias[WORD] = shuffle_word;
-    }
-
-    return output;
-}
-
-
-/**
- * \brief Shuffle-down for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>+<tt>src_offset</tt></sub>.  For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread.  ![](shfl_down_logo.png)
- * \ingroup WarpModule
- *
- * \par
- * - Available only for SM3.0 or newer
- *
- * \par Snippet
- * The code snippet below illustrates each thread obtaining a \p double value from the
- * successor of its successor.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Obtain one input item per thread
- *     double thread_data = ...
- *
- *     // Obtain item from two ranks below
- *     double peer_data = ShuffleDown(thread_data, 2, 31, 0xffffffff);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
- * The corresponding output \p peer_data will be <tt>{3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}</tt>.
- *
- */
-template <typename T>
-__device__ __forceinline__ T ShuffleDown(
-    T               input,              ///< [in] The value to broadcast
-    int             src_offset,         ///< [in] The relative up-offset of the peer to read from
-    int             last_lane,          ///< [in] Index of first lane in segment (typically 31)
-    unsigned int    member_mask)        ///< [in] 32-bit mask of participating warp lanes
-{
-    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
-
-    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
-
-    T               output;
-    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
-    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
-
-    unsigned int shuffle_word;
-    shuffle_word    = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_lane, member_mask);
-    output_alias[0] = shuffle_word;
-
-    #pragma unroll
-    for (int WORD = 1; WORD < WORDS; ++WORD)
-    {
-        shuffle_word       = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_lane, member_mask);
-        output_alias[WORD] = shuffle_word;
-    }
-
-    return output;
-}
-
-
-/**
- * \brief Shuffle-broadcast for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input
- * contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>.  For \p src_lane < 0 or \p src_lane >= WARP_THREADS,
- * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png)
- *
- * \ingroup WarpModule
- *
- * \par
- * - Available only for SM3.0 or newer
- *
- * \par Snippet
- * The code snippet below illustrates each thread obtaining a \p double value from <em>warp-lane</em><sub>0</sub>.
- *
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Obtain one input item per thread
- *     double thread_data = ...
- *
- *     // Obtain item from thread 0
- *     double peer_data = ShuffleIndex(thread_data, 0, 32, 0xffffffff);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
- * The corresponding output \p peer_data will be <tt>{1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}</tt>.
- *
- */
-template <typename T>
-__device__ __forceinline__ T ShuffleIndex(
-    T               input,                  ///< [in] The value to broadcast
-    int             src_lane,               ///< [in] Which warp lane is to do the broadcasting
-    int             logical_warp_threads,   ///< [in] Number of threads per logical warp
-    unsigned int    member_mask)            ///< [in] 32-bit mask of participating warp lanes
-{
-    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
-
-    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
-
-    T               output;
-    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
-    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
-
-    unsigned int shuffle_word;
-    shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
-                                 src_lane,
-                                 logical_warp_threads - 1,
-                                 member_mask);
-
-    output_alias[0] = shuffle_word;
-
-    #pragma unroll
-    for (int WORD = 1; WORD < WORDS; ++WORD)
-    {
-        shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
-                                     src_lane,
-                                     logical_warp_threads - 1,
-                                     member_mask);
-
-        output_alias[WORD] = shuffle_word;
-    }
-
-    return output;
-}
-
-
-
-/**
- * Compute a 32b mask of threads having the same least-significant
- * LABEL_BITS of \p label as the calling thread.
- */
-template <int LABEL_BITS>
-inline __device__ unsigned int MatchAny(unsigned int label)
-{
-    unsigned int retval;
-
-    // Extract masks of common threads for each bit
-    #pragma unroll
-    for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
-    {
-        unsigned int mask;
-        unsigned int current_bit = 1 << BIT;
-        asm ("{\n"
-            "    .reg .pred p;\n"
-            "    and.b32 %0, %1, %2;"
-            "    setp.eq.u32 p, %0, %2;\n"
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-            "    vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
-#else
-            "    vote.ballot.b32 %0, p;\n"
-#endif
-            "    @!p not.b32 %0, %0;\n"
-            "}\n" : "=r"(mask) : "r"(label), "r"(current_bit));
-
-        // Remove peers who differ
-        retval = (BIT == 0) ? mask : retval & mask;
-    }
-
-    return retval;
-
-//  // VOLTA match
-//    unsigned int retval;
-//    asm ("{\n"
-//         "    match.any.sync.b32 %0, %1, 0xffffffff;\n"
-//         "}\n" : "=r"(retval) : "r"(label));
-//    return retval;
-
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/util_type.cuh b/external/cub/cub/util_type.cuh
deleted file mode 100644
index 7de5427fa7a..00000000000
--- a/external/cub/cub/util_type.cuh
+++ /dev/null
@@ -1,1141 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Common type manipulation (metaprogramming) utilities
- */
-
-#pragma once
-
-#include <iostream>
-#include <limits>
-#include <cfloat>
-
-#include "util_macro.cuh"
-#include "util_arch.cuh"
-#include "util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-
-
-/******************************************************************************
- * Type equality
- ******************************************************************************/
-
-/**
- * \brief Type selection (<tt>IF ? ThenType : ElseType</tt>)
- */
-template <bool IF, typename ThenType, typename ElseType>
-struct If
-{
-    /// Conditional type result
-    typedef ThenType Type;      // true
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename ThenType, typename ElseType>
-struct If<false, ThenType, ElseType>
-{
-    typedef ElseType Type;      // false
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Conditional types
- ******************************************************************************/
-
-/**
- * \brief Type equality test
- */
-template <typename A, typename B>
-struct Equals
-{
-    enum {
-        VALUE = 0,
-        NEGATE = 1
-    };
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename A>
-struct Equals <A, A>
-{
-    enum {
-        VALUE = 1,
-        NEGATE = 0
-    };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/******************************************************************************
- * Static math
- ******************************************************************************/
-
-/**
- * \brief Statically determine log2(N), rounded up.
- *
- * For example:
- *     Log2<8>::VALUE   // 3
- *     Log2<3>::VALUE   // 2
- */
-template <int N, int CURRENT_VAL = N, int COUNT = 0>
-struct Log2
-{
-    /// Static logarithm value
-    enum { VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };         // Inductive case
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <int N, int COUNT>
-struct Log2<N, 0, COUNT>
-{
-    enum {VALUE = (1 << (COUNT - 1) < N) ?                                  // Base case
-        COUNT :
-        COUNT - 1 };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/**
- * \brief Statically determine if N is a power-of-two
- */
-template <int N>
-struct PowerOfTwo
-{
-    enum { VALUE = ((N & (N - 1)) == 0) };
-};
-
-
-
-/******************************************************************************
- * Pointer vs. iterator detection
- ******************************************************************************/
-
-/**
- * \brief Pointer vs. iterator
- */
-template <typename Tp>
-struct IsPointer
-{
-    enum { VALUE = 0 };
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename Tp>
-struct IsPointer<Tp*>
-{
-    enum { VALUE = 1 };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Qualifier detection
- ******************************************************************************/
-
-/**
- * \brief Volatile modifier test
- */
-template <typename Tp>
-struct IsVolatile
-{
-    enum { VALUE = 0 };
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename Tp>
-struct IsVolatile<Tp volatile>
-{
-    enum { VALUE = 1 };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/******************************************************************************
- * Qualifier removal
- ******************************************************************************/
-
-/**
- * \brief Removes \p const and \p volatile qualifiers from type \p Tp.
- *
- * For example:
- *     <tt>typename RemoveQualifiers<volatile int>::Type         // int;</tt>
- */
-template <typename Tp, typename Up = Tp>
-struct RemoveQualifiers
-{
-    /// Type without \p const and \p volatile qualifiers
-    typedef Up Type;
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename Tp, typename Up>
-struct RemoveQualifiers<Tp, volatile Up>
-{
-    typedef Up Type;
-};
-
-template <typename Tp, typename Up>
-struct RemoveQualifiers<Tp, const Up>
-{
-    typedef Up Type;
-};
-
-template <typename Tp, typename Up>
-struct RemoveQualifiers<Tp, const volatile Up>
-{
-    typedef Up Type;
-};
-
-
-/******************************************************************************
- * Marker types
- ******************************************************************************/
-
-/**
- * \brief A simple "NULL" marker type
- */
-struct NullType
-{
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    template <typename T>
-    __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; }
-
-    __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; }
-
-    __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; }
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-};
-
-
-/**
- * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
- */
-template <int A>
-struct Int2Type
-{
-   enum {VALUE = A};
-};
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/******************************************************************************
- * Size and alignment
- ******************************************************************************/
-
-/// Structure alignment
-template <typename T>
-struct AlignBytes
-{
-    struct Pad
-    {
-        T       val;
-        char    byte;
-    };
-
-    enum
-    {
-        /// The "true CUDA" alignment of T in bytes
-        ALIGN_BYTES = sizeof(Pad) - sizeof(T)
-    };
-
-    /// The "truly aligned" type
-    typedef T Type;
-};
-
-// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree
-// with device C++ compilers (EDG) on types passed as template parameters through
-// kernel functions
-
-#define __CUB_ALIGN_BYTES(t, b)         \
-    template <> struct AlignBytes<t>    \
-    { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; };
-
-__CUB_ALIGN_BYTES(short4, 8)
-__CUB_ALIGN_BYTES(ushort4, 8)
-__CUB_ALIGN_BYTES(int2, 8)
-__CUB_ALIGN_BYTES(uint2, 8)
-__CUB_ALIGN_BYTES(long long, 8)
-__CUB_ALIGN_BYTES(unsigned long long, 8)
-__CUB_ALIGN_BYTES(float2, 8)
-__CUB_ALIGN_BYTES(double, 8)
-#ifdef _WIN32
-    __CUB_ALIGN_BYTES(long2, 8)
-    __CUB_ALIGN_BYTES(ulong2, 8)
-#else
-    __CUB_ALIGN_BYTES(long2, 16)
-    __CUB_ALIGN_BYTES(ulong2, 16)
-#endif
-__CUB_ALIGN_BYTES(int4, 16)
-__CUB_ALIGN_BYTES(uint4, 16)
-__CUB_ALIGN_BYTES(float4, 16)
-__CUB_ALIGN_BYTES(long4, 16)
-__CUB_ALIGN_BYTES(ulong4, 16)
-__CUB_ALIGN_BYTES(longlong2, 16)
-__CUB_ALIGN_BYTES(ulonglong2, 16)
-__CUB_ALIGN_BYTES(double2, 16)
-__CUB_ALIGN_BYTES(longlong4, 16)
-__CUB_ALIGN_BYTES(ulonglong4, 16)
-__CUB_ALIGN_BYTES(double4, 16)
-
-template <typename T> struct AlignBytes<volatile T> : AlignBytes<T> {};
-template <typename T> struct AlignBytes<const T> : AlignBytes<T> {};
-template <typename T> struct AlignBytes<const volatile T> : AlignBytes<T> {};
-
-
-/// Unit-words of data movement
-template <typename T>
-struct UnitWord
-{
-    enum {
-        ALIGN_BYTES = AlignBytes<T>::ALIGN_BYTES
-    };
-
-    template <typename Unit>
-    struct IsMultiple
-    {
-        enum {
-            UNIT_ALIGN_BYTES    = AlignBytes<Unit>::ALIGN_BYTES,
-            IS_MULTIPLE         = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0)
-        };
-    };
-
-    /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<int>::IS_MULTIPLE,
-        unsigned int,
-        typename If<IsMultiple<short>::IS_MULTIPLE,
-            unsigned short,
-            unsigned char>::Type>::Type         ShuffleWord;
-
-    /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<long long>::IS_MULTIPLE,
-        unsigned long long,
-        ShuffleWord>::Type                      VolatileWord;
-
-    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<longlong2>::IS_MULTIPLE,
-        ulonglong2,
-        VolatileWord>::Type                     DeviceWord;
-
-    /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<int4>::IS_MULTIPLE,
-        uint4,
-        typename If<IsMultiple<int2>::IS_MULTIPLE,
-            uint2,
-            ShuffleWord>::Type>::Type           TextureWord;
-};
-
-
-// float2 specialization workaround (for SM10-SM13)
-template <>
-struct UnitWord <float2>
-{
-    typedef int         ShuffleWord;
-#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
-    typedef float       VolatileWord;
-    typedef uint2       DeviceWord;
-#else
-    typedef unsigned long long   VolatileWord;
-    typedef unsigned long long   DeviceWord;
-#endif
-    typedef float2      TextureWord;
-};
-
-// float4 specialization workaround (for SM10-SM13)
-template <>
-struct UnitWord <float4>
-{
-    typedef int         ShuffleWord;
-#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
-    typedef float               VolatileWord;
-    typedef uint4               DeviceWord;
-#else
-    typedef unsigned long long  VolatileWord;
-    typedef ulonglong2          DeviceWord;
-#endif
-    typedef float4              TextureWord;
-};
-
-
-// char2 specialization workaround (for SM10-SM13)
-template <>
-struct UnitWord <char2>
-{
-    typedef unsigned short      ShuffleWord;
-#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
-    typedef unsigned short      VolatileWord;
-    typedef short               DeviceWord;
-#else
-    typedef unsigned short      VolatileWord;
-    typedef unsigned short      DeviceWord;
-#endif
-    typedef unsigned short      TextureWord;
-};
-
-
-template <typename T> struct UnitWord<volatile T> : UnitWord<T> {};
-template <typename T> struct UnitWord<const T> : UnitWord<T> {};
-template <typename T> struct UnitWord<const volatile T> : UnitWord<T> {};
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Vector type inference utilities.
- ******************************************************************************/
-
-/**
- * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists.  Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields.
- */
-template <typename T, int vec_elements> struct CubVector;
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-enum
-{
-    /// The maximum number of elements in CUDA vector types
-    MAX_VEC_ELEMENTS = 4,
-};
-
-
-/**
- * Generic vector-1 type
- */
-template <typename T>
-struct CubVector<T, 1>
-{
-    T x;
-
-    typedef T BaseType;
-    typedef CubVector<T, 1> Type;
-};
-
-/**
- * Generic vector-2 type
- */
-template <typename T>
-struct CubVector<T, 2>
-{
-    T x;
-    T y;
-
-    typedef T BaseType;
-    typedef CubVector<T, 2> Type;
-};
-
-/**
- * Generic vector-3 type
- */
-template <typename T>
-struct CubVector<T, 3>
-{
-    T x;
-    T y;
-    T z;
-
-    typedef T BaseType;
-    typedef CubVector<T, 3> Type;
-};
-
-/**
- * Generic vector-4 type
- */
-template <typename T>
-struct CubVector<T, 4>
-{
-    T x;
-    T y;
-    T z;
-    T w;
-
-    typedef T BaseType;
-    typedef CubVector<T, 4> Type;
-};
-
-
-/**
- * Macro for expanding partially-specialized built-in vector types
- */
-#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type)                                                    \
-                                                                                                        \
-    template<> struct CubVector<base_type, 1> : short_type##1                                           \
-    {                                                                                                   \
-      typedef base_type       BaseType;                                                                 \
-      typedef short_type##1   Type;                                                                     \
-      __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {           \
-          CubVector retval;                                                                             \
-          retval.x = x + other.x;                                                                       \
-          return retval;                                                                                \
-      }                                                                                                 \
-      __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {           \
-          CubVector retval;                                                                             \
-          retval.x = x - other.x;                                                                       \
-          return retval;                                                                                \
-      }                                                                                                 \
-    };                                                                                                  \
-                                                                                                        \
-    template<> struct CubVector<base_type, 2> : short_type##2                                           \
-    {                                                                                                   \
-        typedef base_type       BaseType;                                                               \
-        typedef short_type##2   Type;                                                                   \
-        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x + other.x;                                                                     \
-            retval.y = y + other.y;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x - other.x;                                                                     \
-            retval.y = y - other.y;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-    };                                                                                                  \
-                                                                                                        \
-    template<> struct CubVector<base_type, 3> : short_type##3                                           \
-    {                                                                                                   \
-        typedef base_type       BaseType;                                                               \
-        typedef short_type##3   Type;                                                                   \
-        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x + other.x;                                                                     \
-            retval.y = y + other.y;                                                                     \
-            retval.z = z + other.z;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x - other.x;                                                                     \
-            retval.y = y - other.y;                                                                     \
-            retval.z = z - other.z;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-    };                                                                                                  \
-                                                                                                        \
-    template<> struct CubVector<base_type, 4> : short_type##4                                           \
-    {                                                                                                   \
-        typedef base_type       BaseType;                                                               \
-        typedef short_type##4   Type;                                                                   \
-        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x + other.x;                                                                     \
-            retval.y = y + other.y;                                                                     \
-            retval.z = z + other.z;                                                                     \
-            retval.w = w + other.w;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x - other.x;                                                                     \
-            retval.y = y - other.y;                                                                     \
-            retval.z = z - other.z;                                                                     \
-            retval.w = w - other.w;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-    };
-
-
-
-// Expand CUDA vector types for built-in primitives
-CUB_DEFINE_VECTOR_TYPE(char,               char)
-CUB_DEFINE_VECTOR_TYPE(signed char,        char)
-CUB_DEFINE_VECTOR_TYPE(short,              short)
-CUB_DEFINE_VECTOR_TYPE(int,                int)
-CUB_DEFINE_VECTOR_TYPE(long,               long)
-CUB_DEFINE_VECTOR_TYPE(long long,          longlong)
-CUB_DEFINE_VECTOR_TYPE(unsigned char,      uchar)
-CUB_DEFINE_VECTOR_TYPE(unsigned short,     ushort)
-CUB_DEFINE_VECTOR_TYPE(unsigned int,       uint)
-CUB_DEFINE_VECTOR_TYPE(unsigned long,      ulong)
-CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong)
-CUB_DEFINE_VECTOR_TYPE(float,              float)
-CUB_DEFINE_VECTOR_TYPE(double,             double)
-CUB_DEFINE_VECTOR_TYPE(bool,               uchar)
-
-// Undefine macros
-#undef CUB_DEFINE_VECTOR_TYPE
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Wrapper types
- ******************************************************************************/
-
-/**
- * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions
- */
-template <typename T>
-struct Uninitialized
-{
-    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename UnitWord<T>::DeviceWord DeviceWord;
-
-    enum
-    {
-        WORDS = sizeof(T) / sizeof(DeviceWord)
-    };
-
-    /// Backing storage
-    DeviceWord storage[WORDS];
-
-    /// Alias
-    __host__ __device__ __forceinline__ T& Alias()
-    {
-        return reinterpret_cast<T&>(*this);
-    }
-};
-
-
-/**
- * \brief A key identifier paired with a corresponding value
- */
-template <
-    typename    _Key,
-    typename    _Value
-#if defined(_WIN32) && !defined(_WIN64)
-    , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES)
-    , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES)
-#endif // #if defined(_WIN32) && !defined(_WIN64)
-    >
-struct KeyValuePair
-{
-    typedef _Key    Key;                ///< Key data type
-    typedef _Value  Value;              ///< Value data type
-
-    Key     key;                        ///< Item key
-    Value   value;                      ///< Item value
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair() {}
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
-
-    /// Inequality operator
-    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
-    {
-        return (value != b.value) || (key != b.key);
-    }
-};
-
-#if defined(_WIN32) && !defined(_WIN64)
-
-/**
- * Win32 won't do 16B alignment.  This can present two problems for
- * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members:
- * 1) If a smaller-aligned item were to be listed first, the host compiler places the
- *    should-be-16B item at too early an offset (and disagrees with device compiler)
- * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size
- *    of the struct wrong (and disagrees with device compiler)
- *
- * So we put the larger-should-be-aligned item first, and explicitly pad the
- * end of the struct
- */
-
-/// Smaller key specialization
-template <typename K, typename V>
-struct KeyValuePair<K, V, true, false>
-{
-    typedef K Key;
-    typedef V Value;
-
-    typedef char Pad[AlignBytes<V>::ALIGN_BYTES - AlignBytes<K>::ALIGN_BYTES];
-
-    Value   value;  // Value has larger would-be alignment and goes first
-    Key     key;
-    Pad     pad;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair() {}
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
-
-    /// Inequality operator
-    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
-    {
-        return (value != b.value) || (key != b.key);
-    }
-};
-
-
-/// Smaller value specialization
-template <typename K, typename V>
-struct KeyValuePair<K, V, false, true>
-{
-    typedef K Key;
-    typedef V Value;
-
-    typedef char Pad[AlignBytes<K>::ALIGN_BYTES - AlignBytes<V>::ALIGN_BYTES];
-
-    Key     key;    // Key has larger would-be alignment and goes first
-    Value   value;
-    Pad     pad;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair() {}
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
-
-    /// Inequality operator
-    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
-    {
-        return (value != b.value) || (key != b.key);
-    }
-};
-
-#endif // #if defined(_WIN32) && !defined(_WIN64)
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/**
- * \brief A wrapper for passing simple static arrays as kernel parameters
- */
-template <typename T, int COUNT>
-struct ArrayWrapper
-{
-
-    /// Statically-sized array of type \p T
-    T array[COUNT];
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ArrayWrapper() {}
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth.
- *
- * Many multi-pass computations require a pair of "ping-pong" storage
- * buffers (e.g., one for reading from and the other for writing to, and then
- * vice-versa for the subsequent pass).  This structure wraps a set of device
- * buffers and a "selector" member to track which is "current".
- */
-template <typename T>
-struct DoubleBuffer
-{
-    /// Pair of device buffer pointers
-    T *d_buffers[2];
-
-    ///  Selector into \p d_buffers (i.e., the active/valid buffer)
-    int selector;
-
-    /// \brief Constructor
-    __host__ __device__ __forceinline__ DoubleBuffer()
-    {
-        selector = 0;
-        d_buffers[0] = NULL;
-        d_buffers[1] = NULL;
-    }
-
-    /// \brief Constructor
-    __host__ __device__ __forceinline__ DoubleBuffer(
-        T *d_current,         ///< The currently valid buffer
-        T *d_alternate)       ///< Alternate storage buffer of the same size as \p d_current
-    {
-        selector = 0;
-        d_buffers[0] = d_current;
-        d_buffers[1] = d_alternate;
-    }
-
-    /// \brief Return pointer to the currently valid buffer
-    __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; }
-
-    /// \brief Return pointer to the currently invalid buffer
-    __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; }
-
-};
-
-
-
-/******************************************************************************
- * Typedef-detection
- ******************************************************************************/
-
-
-/**
- * \brief Defines a structure \p detector_name that is templated on type \p T.  The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name
- */
-#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name)  \
-    template <typename T>                                               \
-    struct detector_name                                                \
-    {                                                                   \
-        template <typename C>                                           \
-        static char& test(typename C::nested_type_name*);               \
-        template <typename>                                             \
-        static int& test(...);                                          \
-        enum                                                            \
-        {                                                               \
-            VALUE = sizeof(test<T>(0)) < sizeof(int)                    \
-        };                                                              \
-    };
-
-
-
-/******************************************************************************
- * Simple enable-if (similar to Boost)
- ******************************************************************************/
-
-/**
- * \brief Simple enable-if (similar to Boost)
- */
-template <bool Condition, class T = void>
-struct EnableIf
-{
-    /// Enable-if type for SFINAE dummy variables
-    typedef T Type;
-};
-
-
-template <class T>
-struct EnableIf<false, T> {};
-
-
-
-/******************************************************************************
- * Typedef-detection
- ******************************************************************************/
-
-/**
- * \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T&b)</tt> or <tt>bool operator()(const T& a, const T&b, unsigned int idx)</tt>
- */
-template <typename T, typename BinaryOp>
-struct BinaryOpHasIdxParam
-{
-private:
-/*
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const>  struct SFINAE1 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)>        struct SFINAE2 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const>                struct SFINAE3 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)>                      struct SFINAE4 {};
-*/
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const>           struct SFINAE5 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)>                 struct SFINAE6 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const>                         struct SFINAE7 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)>                               struct SFINAE8 {};
-/*
-    template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
-*/
-    template <typename BinaryOpT> static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
-
-    template <typename BinaryOpT> static int Test(...);
-
-public:
-
-    /// Whether the functor BinaryOp has a third <tt>unsigned int</tt> index param
-    static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char);
-};
-
-
-
-
-/******************************************************************************
- * Simple type traits utilities.
- *
- * For example:
- *     Traits<int>::CATEGORY             // SIGNED_INTEGER
- *     Traits<NullType>::NULL_TYPE       // true
- *     Traits<uint4>::CATEGORY           // NOT_A_NUMBER
- *     Traits<uint4>::PRIMITIVE;         // false
- *
- ******************************************************************************/
-
-/**
- * \brief Basic type traits categories
- */
-enum Category
-{
-    NOT_A_NUMBER,
-    SIGNED_INTEGER,
-    UNSIGNED_INTEGER,
-    FLOATING_POINT
-};
-
-
-/**
- * \brief Basic type traits
- */
-template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits, typename T>
-struct BaseTraits
-{
-    /// Category
-    static const Category CATEGORY      = _CATEGORY;
-    enum
-    {
-        PRIMITIVE       = _PRIMITIVE,
-        NULL_TYPE       = _NULL_TYPE,
-    };
-};
-
-
-/**
- * Basic type traits (unsigned primitive specialization)
- */
-template <typename _UnsignedBits, typename T>
-struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits, T>
-{
-    typedef _UnsignedBits       UnsignedBits;
-
-    static const Category       CATEGORY    = UNSIGNED_INTEGER;
-    static const UnsignedBits   LOWEST_KEY  = UnsignedBits(0);
-    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1);
-
-    enum
-    {
-        PRIMITIVE       = true,
-        NULL_TYPE       = false,
-    };
-
-
-    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
-    {
-        return key;
-    }
-
-    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
-    {
-        return key;
-    }
-
-    static __host__ __device__ __forceinline__ T Max()
-    {
-        UnsignedBits retval = MAX_KEY;
-        return reinterpret_cast<T&>(retval);
-    }
-
-    static __host__ __device__ __forceinline__ T Lowest()
-    {
-        UnsignedBits retval = LOWEST_KEY;
-        return reinterpret_cast<T&>(retval);
-    }
-};
-
-
-/**
- * Basic type traits (signed primitive specialization)
- */
-template <typename _UnsignedBits, typename T>
-struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits, T>
-{
-    typedef _UnsignedBits       UnsignedBits;
-
-    static const Category       CATEGORY    = SIGNED_INTEGER;
-    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
-    static const UnsignedBits   LOWEST_KEY  = HIGH_BIT;
-    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
-
-    enum
-    {
-        PRIMITIVE       = true,
-        NULL_TYPE       = false,
-    };
-
-    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
-    {
-        return key ^ HIGH_BIT;
-    };
-
-    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
-    {
-        return key ^ HIGH_BIT;
-    };
-
-    static __host__ __device__ __forceinline__ T Max()
-    {
-        UnsignedBits retval = MAX_KEY;
-        return reinterpret_cast<T&>(retval);
-    }
-
-    static __host__ __device__ __forceinline__ T Lowest()
-    {
-        UnsignedBits retval = LOWEST_KEY;
-        return reinterpret_cast<T&>(retval);
-    }
-};
-
-template <typename _T>
-struct FpLimits;
-
-template <>
-struct FpLimits<float>
-{
-    static __host__ __device__ __forceinline__ float Max() {
-        return FLT_MAX;
-    }
-
-    static __host__ __device__ __forceinline__ float Lowest() {
-        return FLT_MAX * float(-1);
-    }
-};
-
-template <>
-struct FpLimits<double>
-{
-    static __host__ __device__ __forceinline__ double Max() {
-        return DBL_MAX;
-    }
-
-    static __host__ __device__ __forceinline__ double Lowest() {
-        return DBL_MAX  * double(-1);
-    }
-};
-
-
-/**
- * Basic type traits (fp primitive specialization)
- */
-template <typename _UnsignedBits, typename T>
-struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits, T>
-{
-    typedef _UnsignedBits       UnsignedBits;
-
-    static const Category       CATEGORY    = FLOATING_POINT;
-    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
-    static const UnsignedBits   LOWEST_KEY  = UnsignedBits(-1);
-    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
-
-    enum
-    {
-        PRIMITIVE       = true,
-        NULL_TYPE       = false,
-    };
-
-    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
-    {
-        UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT;
-        return key ^ mask;
-    };
-
-    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
-    {
-        UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1);
-        return key ^ mask;
-    };
-
-    static __host__ __device__ __forceinline__ T Max() {
-        return FpLimits<T>::Max();
-    }
-
-    static __host__ __device__ __forceinline__ T Lowest() {
-        return FpLimits<T>::Lowest();
-    }
-};
-
-
-/**
- * \brief Numeric type traits
- */
-template <typename T> struct NumericTraits :            BaseTraits<NOT_A_NUMBER, false, false, T, T> {};
-
-template <> struct NumericTraits<NullType> :            BaseTraits<NOT_A_NUMBER, false, true, NullType, NullType> {};
-
-template <> struct NumericTraits<char> :                BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {};
-template <> struct NumericTraits<signed char> :         BaseTraits<SIGNED_INTEGER, true, false, unsigned char, signed char> {};
-template <> struct NumericTraits<short> :               BaseTraits<SIGNED_INTEGER, true, false, unsigned short, short> {};
-template <> struct NumericTraits<int> :                 BaseTraits<SIGNED_INTEGER, true, false, unsigned int, int> {};
-template <> struct NumericTraits<long> :                BaseTraits<SIGNED_INTEGER, true, false, unsigned long, long> {};
-template <> struct NumericTraits<long long> :           BaseTraits<SIGNED_INTEGER, true, false, unsigned long long, long long> {};
-
-template <> struct NumericTraits<unsigned char> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char, unsigned char> {};
-template <> struct NumericTraits<unsigned short> :      BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short, unsigned short> {};
-template <> struct NumericTraits<unsigned int> :        BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int, unsigned int> {};
-template <> struct NumericTraits<unsigned long> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long, unsigned long> {};
-template <> struct NumericTraits<unsigned long long> :  BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long, unsigned long long> {};
-
-template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, false, unsigned int, float> {};
-template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, false, unsigned long long, double> {};
-
-template <> struct NumericTraits<bool> :                BaseTraits<UNSIGNED_INTEGER, true, false, typename UnitWord<bool>::VolatileWord, bool> {};
-
-
-
-/**
- * \brief Type traits
- */
-template <typename T>
-struct Traits : NumericTraits<typename RemoveQualifiers<T>::Type> {};
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/** @} */       // end group UtilModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/external/cub/cub/warp/specializations/warp_reduce_shfl.cuh
deleted file mode 100644
index 682a5bfedc2..00000000000
--- a/external/cub/cub/warp/specializations/warp_reduce_shfl.cuh
+++ /dev/null
@@ -1,551 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_type.cuh"
-#include "../../util_macro.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- *
- * LOGICAL_WARP_THREADS must be a power-of-two
- */
-template <
-    typename    T,                      ///< Data type being reduced
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpReduceShfl
-{
-    //---------------------------------------------------------------------
-    // Constants and type definitions
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// The number of warp reduction steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// Number of logical warps in a PTX warp
-        LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS,
-    };
-
-    template <typename S>
-    struct IsInteger
-    {
-        enum {
-            ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange
-            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
-        };
-    };
-
-
-    // Creates a mask where the last thread in each logical warp is set
-    template <int WARP, int WARPS>
-    struct LastLaneMask
-    {
-        enum {
-            BASE_MASK   = 1 << (LOGICAL_WARP_THREADS - 1),
-            MASK        = (LastLaneMask<WARP + 1, WARPS>::MASK << LOGICAL_WARP_THREADS) | BASE_MASK,
-        };
-    };
-
-    // Creates a mask where the last thread in each logical warp is set
-    template <int WARP>
-    struct LastLaneMask<WARP, WARP>
-    {
-        enum {
-            MASK        = 1 << (LOGICAL_WARP_THREADS - 1),
-        };
-    };
-
-
-
-    /// Shared memory storage layout type
-    typedef NullType TempStorage;
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-
-    unsigned int lane_id;
-
-    unsigned int member_mask;
-
-    //---------------------------------------------------------------------
-    // Construction
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ WarpReduceShfl(
-        TempStorage &/*temp_storage*/)
-    :
-        lane_id(LaneId()),
-
-        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ?
-            0 : // arch-width subwarps need not be tiled within the arch-warp
-            ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Reduction steps
-    //---------------------------------------------------------------------
-
-    /// Reduction (specialized for summation across uint32 types)
-    __device__ __forceinline__ unsigned int ReduceStep(
-        unsigned int    input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*reduction_op*/,   ///< [in] Binary reduction operator
-        int             last_lane,          ///< [in] Index of last lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned int output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 r0;"
-            "  .reg .pred p;"
-            "  shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
-            "  @p add.u32 r0, r0, %4;"
-            "  mov.u32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 r0;"
-            "  .reg .pred p;"
-            "  shfl.down.b32 r0|p, %1, %2, %3;"
-            "  @p add.u32 r0, r0, %4;"
-            "  mov.u32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across fp32 types)
-    __device__ __forceinline__ float ReduceStep(
-        float           input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*reduction_op*/,   ///< [in] Binary reduction operator
-        int             last_lane,          ///< [in] Index of last lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        float output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .f32 r0;"
-            "  .reg .pred p;"
-            "  shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
-            "  @p add.f32 r0, r0, %4;"
-            "  mov.f32 %0, r0;"
-            "}"
-            : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .f32 r0;"
-            "  .reg .pred p;"
-            "  shfl.down.b32 r0|p, %1, %2, %3;"
-            "  @p add.f32 r0, r0, %4;"
-            "  mov.f32 %0, r0;"
-            "}"
-            : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across unsigned long long types)
-    __device__ __forceinline__ unsigned long long ReduceStep(
-        unsigned long long  input,              ///< [in] Calling thread's input item.
-        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned long long output;
-
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
-            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.u64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.down.b32 lo|p, lo, %2, %3;"
-            "  shfl.down.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.u64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane));
-#endif
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across long long types)
-    __device__ __forceinline__ long long ReduceStep(
-        long long           input,              ///< [in] Calling thread's input item.
-        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        long long output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
-            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.s64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.down.b32 lo|p, lo, %2, %3;"
-            "  shfl.down.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.s64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane));
-#endif
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across double types)
-    __device__ __forceinline__ double ReduceStep(
-        double              input,              ///< [in] Calling thread's input item.
-        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        double output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  .reg .f64 r0;"
-            "  mov.b64 %0, %1;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
-            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.f64 %0, %0, r0;"
-            "}"
-            : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  .reg .f64 r0;"
-            "  mov.b64 %0, %1;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.down.b32 lo|p, lo, %2, %3;"
-            "  shfl.down.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.f64 %0, %0, r0;"
-            "}"
-            : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane));
-#endif
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types)
-    template <typename ValueT, typename KeyT>
-    __device__ __forceinline__ KeyValuePair<KeyT, ValueT> ReduceStep(
-        KeyValuePair<KeyT, ValueT>                  input,              ///< [in] Calling thread's input item.
-        SwizzleScanOp<ReduceByKeyOp<cub::Sum> >     /*reduction_op*/,       ///< [in] Binary reduction operator
-        int                                         last_lane,          ///< [in] Index of last lane in segment
-        int                                         offset)             ///< [in] Up-offset to pull from
-    {
-        KeyValuePair<KeyT, ValueT> output;
-
-        KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask);
-        
-        output.key = input.key;
-        output.value = ReduceStep(
-            input.value, 
-            cub::Sum(), 
-            last_lane, 
-            offset, 
-            Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
-
-        if (input.key != other_key)
-            output.value = input.value;
-
-        return output;
-    }
-
-
-
-    /// Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types)
-    template <typename ValueT, typename OffsetT>
-    __device__ __forceinline__ KeyValuePair<OffsetT, ValueT> ReduceStep(
-        KeyValuePair<OffsetT, ValueT>                 input,              ///< [in] Calling thread's input item.
-        SwizzleScanOp<ReduceBySegmentOp<cub::Sum> >   /*reduction_op*/,   ///< [in] Binary reduction operator
-        int                                           last_lane,          ///< [in] Index of last lane in segment
-        int                                           offset)             ///< [in] Up-offset to pull from
-    {
-        KeyValuePair<OffsetT, ValueT> output;
-
-        output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
-        output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type<IsInteger<OffsetT>::IS_SMALL_UNSIGNED>());
-
-        if (input.key > 0)
-            output.value = input.value;
-
-        return output;
-    }
-
-
-    /// Reduction step (generic)
-    template <typename _T, typename ReductionOp>
-    __device__ __forceinline__ _T ReduceStep(
-        _T                  input,              ///< [in] Calling thread's input item.
-        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        _T output = input;
-
-        _T temp = ShuffleDown(output, offset, last_lane, member_mask);
-
-        // Perform reduction op if valid
-        if (offset + lane_id <= last_lane)
-            output = reduction_op(input, temp);
-
-        return output;
-    }
-
-
-    /// Reduction step (specialized for small unsigned integers size 32b or less)
-    template <typename _T, typename ReductionOp>
-    __device__ __forceinline__ _T ReduceStep(
-        _T              input,                  ///< [in] Calling thread's input item.
-        ReductionOp     reduction_op,           ///< [in] Binary reduction operator
-        int             last_lane,              ///< [in] Index of last lane in segment
-        int             offset,                 ///< [in] Up-offset to pull from
-        Int2Type<true>  /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small unsigned integer
-    {
-        return ReduceStep(input, reduction_op, last_lane, offset);
-    }
-
-
-    /// Reduction step (specialized for types other than small unsigned integers size 32b or less)
-    template <typename _T, typename ReductionOp>
-    __device__ __forceinline__ _T ReduceStep(
-        _T              input,                  ///< [in] Calling thread's input item.
-        ReductionOp     reduction_op,           ///< [in] Binary reduction operator
-        int             last_lane,              ///< [in] Index of last lane in segment
-        int             offset,                 ///< [in] Up-offset to pull from
-        Int2Type<false> /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small unsigned integer
-    {
-        return ReduceStep(input, reduction_op, last_lane, offset);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Templated inclusive scan iteration
-    //---------------------------------------------------------------------
-
-    template <typename ReductionOp, int STEP>
-    __device__ __forceinline__ void ReduceStep(
-        T&              input,              ///< [in] Calling thread's input item.
-        ReductionOp     reduction_op,       ///< [in] Binary reduction operator
-        int             last_lane,          ///< [in] Index of last lane in segment
-        Int2Type<STEP>  /*step*/)
-    {
-        input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
-
-        ReduceStep(input, reduction_op, last_lane, Int2Type<STEP + 1>());
-    }
-
-    template <typename ReductionOp>
-    __device__ __forceinline__ void ReduceStep(
-        T&              /*input*/,              ///< [in] Calling thread's input item.
-        ReductionOp     /*reduction_op*/,       ///< [in] Binary reduction operator
-        int             /*last_lane*/,          ///< [in] Index of last lane in segment
-        Int2Type<STEPS> /*step*/)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Reduction operations
-    //---------------------------------------------------------------------
-
-    /// Reduction
-    template <
-        bool            ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
-        int             FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
-        typename        ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               input,                  ///< [in] Calling thread's input
-        int             folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp     reduction_op)           ///< [in] Binary reduction operator
-    {
-        // Get the lane of the first and last thread in the logical warp
-        int first_thread   = 0;
-        int last_thread    = LOGICAL_WARP_THREADS - 1;
-        if (!IS_ARCH_WARP)
-        {
-            first_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1));
-            last_thread |= lane_id;
-        }
-
-        // Common case is FOLDED_ITEMS_PER_LANE = 1 (or a multiple of 32)
-        int lanes_with_valid_data = (folded_items_per_warp - 1) / FOLDED_ITEMS_PER_LANE;
-
-        // Get the last valid lane
-        int last_lane = (ALL_LANES_VALID) ?
-            last_thread :
-            CUB_MIN(last_thread, first_thread + lanes_with_valid_data);
-
-        T output = input;
-
-//        // Iterate reduction steps
-//        #pragma unroll
-//        for (int STEP = 0; STEP < STEPS; STEP++)
-//        {
-//            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
-//        }
-
-        // Template-iterate reduction steps
-        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
-
-        return output;
-    }
-
-
-    /// Segmented reduction
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        FlagT,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,              ///< [in] Calling thread's input
-        FlagT           flag,               ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op)       ///< [in] Binary reduction operator
-    {
-        // Get the start flags for each thread in the warp.
-        int warp_flags = WARP_BALLOT(flag, member_mask);
-
-        // Convert to tail-segmented
-        if (HEAD_SEGMENTED)
-            warp_flags >>= 1;
-
-        // Mask in the last lanes of each logical warp
-        warp_flags |= LastLaneMask<1, LOGICAL_WARPS>::MASK;
-
-        // Mask out the bits below the current thread
-        warp_flags &= LaneMaskGe();
-
-        // Find the next set flag
-        int last_lane = __clz(__brev(warp_flags));
-
-        T output = input;
-
-//        // Iterate reduction steps
-//        #pragma unroll
-//        for (int STEP = 0; STEP < STEPS; STEP++)
-//        {
-//            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
-//        }
-
-        // Template-iterate reduction steps
-        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
-
-        return output;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/specializations/warp_reduce_smem.cuh b/external/cub/cub/warp/specializations/warp_reduce_smem.cuh
deleted file mode 100644
index 9ba8e94d12d..00000000000
--- a/external/cub/cub/warp/specializations/warp_reduce_smem.cuh
+++ /dev/null
@@ -1,375 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../thread/thread_load.cuh"
-#include "../../thread/thread_store.cuh"
-#include "../../util_type.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- */
-template <
-    typename    T,                      ///< Data type being reduced
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpReduceSmem
-{
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of warp scan steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of threads in half a warp
-        HALF_WARP_THREADS = 1 << (STEPS - 1),
-
-        /// The number of shared memory elements per warp
-        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
-
-        /// FlagT status (when not using ballot)
-        UNSET   = 0x0,  // Is initially unset
-        SET     = 0x1,  // Is initially set
-        SEEN    = 0x2,  // Has seen another head flag from a successor peer
-    };
-
-    /// Shared memory flag type
-    typedef unsigned char SmemFlag;
-
-    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
-    struct _TempStorage
-    {
-        T           reduce[WARP_SMEM_ELEMENTS];
-        SmemFlag    flags[WARP_SMEM_ELEMENTS];
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    _TempStorage    &temp_storage;
-    unsigned int    lane_id;
-    unsigned int    member_mask;
-
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpReduceSmem(
-        TempStorage     &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS),
-
-        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
-            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
-            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
-    {}
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    //---------------------------------------------------------------------
-    // Regular reduction
-    //---------------------------------------------------------------------
-
-    /**
-     * Reduction step
-     */
-    template <
-        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
-        int                 FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
-        typename            ReductionOp,
-        int                 STEP>
-    __device__ __forceinline__ T ReduceStep(
-        T                   input,                  ///< [in] Calling thread's input
-        int                 folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp         reduction_op,           ///< [in] Reduction operator
-        Int2Type<STEP>      /*step*/)
-    {
-        const int OFFSET = 1 << STEP;
-
-        // Share input through buffer
-        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
-
-        WARP_SYNC(member_mask);
-
-        // Update input if peer_addend is in range
-        if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp))
-        {
-            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
-            input = reduction_op(input, peer_addend);
-        }
-
-        WARP_SYNC(member_mask);
-
-        return ReduceStep<ALL_LANES_VALID, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, reduction_op, Int2Type<STEP + 1>());
-    }
-
-
-    /**
-     * Reduction step (terminate)
-     */
-    template <
-        bool                ALL_LANES_VALID,            ///< Whether all lanes in each warp are contributing a valid fold of items
-        int                 FOLDED_ITEMS_PER_LANE,      ///< Number of items folded into each lane
-        typename            ReductionOp>
-    __device__ __forceinline__ T ReduceStep(
-        T                   input,                      ///< [in] Calling thread's input
-        int                 /*folded_items_per_warp*/,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp         /*reduction_op*/,           ///< [in] Reduction operator
-        Int2Type<STEPS>     /*step*/)
-    {
-        return input;
-    }
-
-
-    //---------------------------------------------------------------------
-    // Segmented reduction
-    //---------------------------------------------------------------------
-
-
-    /**
-     * Ballot-based segmented reduce
-     */
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        FlagT,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,                  ///< [in] Calling thread's input
-        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op,           ///< [in] Reduction operator
-        Int2Type<true>  /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
-    {
-        // Get the start flags for each thread in the warp.
-        int warp_flags = WARP_BALLOT(flag, member_mask);
-
-        if (!HEAD_SEGMENTED)
-            warp_flags <<= 1;
-
-        // Keep bits above the current thread.
-        warp_flags &= LaneMaskGt();
-
-        // Accommodate packing of multiple logical warps in a single physical warp
-        if (!IS_ARCH_WARP)
-        {
-            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
-        }
-
-        // Find next flag
-        int next_flag = __clz(__brev(warp_flags));
-
-        // Clip the next segment at the warp boundary if necessary
-        if (LOGICAL_WARP_THREADS != 32)
-            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
-
-        #pragma unroll
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            const int OFFSET = 1 << STEP;
-
-            // Share input into buffer
-            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
-
-            WARP_SYNC(member_mask);
-
-            // Update input if peer_addend is in range
-            if (OFFSET + lane_id < next_flag)
-            {
-                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
-                input = reduction_op(input, peer_addend);
-            }
-
-            WARP_SYNC(member_mask);
-        }
-
-        return input;
-    }
-
-
-    /**
-     * Smem-based segmented reduce
-     */
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        FlagT,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,                  ///< [in] Calling thread's input
-        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op,           ///< [in] Reduction operator
-        Int2Type<false> /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
-    {
-        enum
-        {
-            UNSET   = 0x0,  // Is initially unset
-            SET     = 0x1,  // Is initially set
-            SEEN    = 0x2,  // Has seen another head flag from a successor peer
-        };
-
-        // Alias flags onto shared data storage
-        volatile SmemFlag *flag_storage = temp_storage.flags;
-
-        SmemFlag flag_status = (flag) ? SET : UNSET;
-
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            const int OFFSET = 1 << STEP;
-
-            // Share input through buffer
-            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
-
-            WARP_SYNC(member_mask);
-
-            // Get peer from buffer
-            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
-
-            WARP_SYNC(member_mask);
-
-            // Share flag through buffer
-            flag_storage[lane_id] = flag_status;
-
-            // Get peer flag from buffer
-            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];
-
-            // Update input if peer was in range
-            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
-            {
-                if (HEAD_SEGMENTED)
-                {
-                    // Head-segmented
-                    if ((flag_status & SEEN) == 0)
-                    {
-                        // Has not seen a more distant head flag
-                        if (peer_flag_status & SET)
-                        {
-                            // Has now seen a head flag
-                            flag_status |= SEEN;
-                        }
-                        else
-                        {
-                            // Peer is not a head flag: grab its count
-                            input = reduction_op(input, peer_addend);
-                        }
-
-                        // Update seen status to include that of peer
-                        flag_status |= (peer_flag_status & SEEN);
-                    }
-                }
-                else
-                {
-                    // Tail-segmented.  Simply propagate flag status
-                    if (!flag_status)
-                    {
-                        input = reduction_op(input, peer_addend);
-                        flag_status |= peer_flag_status;
-                    }
-
-                }
-            }
-        }
-
-        return input;
-    }
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    /**
-     * Reduction
-     */
-    template <
-        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
-        int                 FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,                  ///< [in] Calling thread's input
-        int                 folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp         reduction_op)           ///< [in] Reduction operator
-    {
-        return ReduceStep<ALL_LANES_VALID, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, reduction_op, Int2Type<0>());
-    }
-
-
-    /**
-     * Segmented reduction
-     */
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        FlagT,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,              ///< [in] Calling thread's input
-        FlagT            flag,               ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op)       ///< [in] Reduction operator
-    {
-        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
-    }
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/specializations/warp_scan_shfl.cuh b/external/cub/cub/warp/specializations/warp_scan_shfl.cuh
deleted file mode 100644
index f0deb8ddefc..00000000000
--- a/external/cub/cub/warp/specializations/warp_scan_shfl.cuh
+++ /dev/null
@@ -1,656 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../util_type.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- *
- * LOGICAL_WARP_THREADS must be a power-of-two
- */
-template <
-    typename    T,                      ///< Data type being scanned
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpScanShfl
-{
-    //---------------------------------------------------------------------
-    // Constants and type definitions
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// The number of warp scan steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
-        SHFL_C = ((0xFFFFFFFFU << STEPS) & 31) << 8,
-    };
-
-    template <typename S>
-    struct IntegerTraits
-    {
-        enum {
-            ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange
-            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
-        };
-    };
-
-    /// Shared memory storage layout type
-    struct TempStorage {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-    unsigned int lane_id;
-
-    unsigned int member_mask;
-
-    //---------------------------------------------------------------------
-    // Construction
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ WarpScanShfl(
-        TempStorage &/*temp_storage*/)
-    :
-        lane_id(LaneId()),
-
-        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ?
-            0 : // arch-width subwarps need not be tiled within the arch-warp
-            ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scan steps
-    //---------------------------------------------------------------------
-
-    /// Inclusive prefix scan step (specialized for summation across int32 types)
-    __device__ __forceinline__ int InclusiveScanStep(
-        int             input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        int output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .s32 r0;"
-            "  .reg .pred p;"
-            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
-            "  @p add.s32 r0, r0, %4;"
-            "  mov.s32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .s32 r0;"
-            "  .reg .pred p;"
-            "  shfl.up.b32 r0|p, %1, %2, %3;"
-            "  @p add.s32 r0, r0, %4;"
-            "  mov.s32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
-#endif
-
-        return output;
-    }
-
-    /// Inclusive prefix scan step (specialized for summation across uint32 types)
-    __device__ __forceinline__ unsigned int InclusiveScanStep(
-        unsigned int    input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned int output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 r0;"
-            "  .reg .pred p;"
-            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
-            "  @p add.u32 r0, r0, %4;"
-            "  mov.u32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 r0;"
-            "  .reg .pred p;"
-            "  shfl.up.b32 r0|p, %1, %2, %3;"
-            "  @p add.u32 r0, r0, %4;"
-            "  mov.u32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across fp32 types)
-    __device__ __forceinline__ float InclusiveScanStep(
-        float           input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        float output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .f32 r0;"
-            "  .reg .pred p;"
-            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
-            "  @p add.f32 r0, r0, %4;"
-            "  mov.f32 %0, r0;"
-            "}"
-            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .f32 r0;"
-            "  .reg .pred p;"
-            "  shfl.up.b32 r0|p, %1, %2, %3;"
-            "  @p add.f32 r0, r0, %4;"
-            "  mov.f32 %0, r0;"
-            "}"
-            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across unsigned long long types)
-    __device__ __forceinline__ unsigned long long InclusiveScanStep(
-        unsigned long long  input,              ///< [in] Calling thread's input item.
-        cub::Sum            /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned long long output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u64 r0;"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
-            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.u64 r0, r0, %4;"
-            "  mov.u64 %0, r0;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u64 r0;"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.up.b32 lo|p, lo, %2, %3;"
-            "  shfl.up.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.u64 r0, r0, %4;"
-            "  mov.u64 %0, r0;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across long long types)
-    __device__ __forceinline__ long long InclusiveScanStep(
-        long long       input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        long long output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .s64 r0;"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
-            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.s64 r0, r0, %4;"
-            "  mov.s64 %0, r0;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .s64 r0;"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.up.b32 lo|p, lo, %2, %3;"
-            "  shfl.up.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.s64 r0, r0, %4;"
-            "  mov.s64 %0, r0;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across fp64 types)
-    __device__ __forceinline__ double InclusiveScanStep(
-        double          input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        double output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  .reg .f64 r0;"
-            "  mov.b64 %0, %1;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %4;"
-            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %4;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.f64 %0, %0, r0;"
-            "}"
-            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  .reg .f64 r0;"
-            "  mov.b64 %0, %1;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.up.b32 lo|p, lo, %2, %3;"
-            "  shfl.up.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.f64 %0, %0, r0;"
-            "}"
-            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c));
-#endif
-
-        return output;
-    }
-
-
-/*
-    /// Inclusive prefix scan (specialized for ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, Value> types)
-    template <typename Value, typename OffsetT>
-    __device__ __forceinline__ KeyValuePair<OffsetT, Value>InclusiveScanStep(
-        KeyValuePair<OffsetT, Value>    input,              ///< [in] Calling thread's input item.
-        ReduceBySegmentOp<cub::Sum>     scan_op,            ///< [in] Binary scan operator
-        int                             first_lane,         ///< [in] Index of first lane in segment
-        int                             offset)             ///< [in] Up-offset to pull from
-    {
-        KeyValuePair<OffsetT, Value> output;
-
-        output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<Value>::IS_SMALL_UNSIGNED>());
-        output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<OffsetT>::IS_SMALL_UNSIGNED>());
-
-        if (input.key > 0)
-            output.value = input.value;
-
-        return output;
-    }
-*/
-
-    /// Inclusive prefix scan step (generic)
-    template <typename _T, typename ScanOpT>
-    __device__ __forceinline__ _T InclusiveScanStep(
-        _T              input,              ///< [in] Calling thread's input item.
-        ScanOpT          scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        _T temp = ShuffleUp(input, offset, first_lane, member_mask);
-
-        // Perform scan op if from a valid peer
-        _T output = scan_op(temp, input);
-        if (static_cast<int>(lane_id) < first_lane + offset)
-            output = input;
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for small integers size 32b or less)
-    template <typename _T, typename ScanOpT>
-    __device__ __forceinline__ _T InclusiveScanStep(
-        _T              input,              ///< [in] Calling thread's input item.
-        ScanOpT          scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset,             ///< [in] Up-offset to pull from
-        Int2Type<true>  /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small integer
-    {
-        return InclusiveScanStep(input, scan_op, first_lane, offset);
-    }
-
-
-    /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less)
-    template <typename _T, typename ScanOpT>
-    __device__ __forceinline__ _T InclusiveScanStep(
-        _T              input,              ///< [in] Calling thread's input item.
-        ScanOpT          scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset,             ///< [in] Up-offset to pull from
-        Int2Type<false> /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small integer
-    {
-        return InclusiveScanStep(input, scan_op, first_lane, offset);
-    }
-
-    //---------------------------------------------------------------------
-    // Templated inclusive scan iteration
-    //---------------------------------------------------------------------
-
-    template <typename _T, typename ScanOp, int STEP>
-    __device__ __forceinline__ void InclusiveScanStep(
-        _T&             input,              ///< [in] Calling thread's input item.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        Int2Type<STEP>  /*step*/)               ///< [in] Marker type indicating scan step
-    {
-        input = InclusiveScanStep(input, scan_op, first_lane, 1 << STEP, Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
-
-        InclusiveScanStep(input, scan_op, first_lane, Int2Type<STEP + 1>());
-    }
-
-    template <typename _T, typename ScanOp>
-    __device__ __forceinline__ void InclusiveScanStep(
-        _T&             /*input*/,              ///< [in] Calling thread's input item.
-        ScanOp          /*scan_op*/,            ///< [in] Binary scan operator
-        int             /*first_lane*/,         ///< [in] Index of first lane in segment
-        Int2Type<STEPS> /*step*/)               ///< [in] Marker type indicating scan step
-    {}
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    //---------------------------------------------------------------------
-    // Broadcast
-    //---------------------------------------------------------------------
-
-    /// Broadcast
-    __device__ __forceinline__ T Broadcast(
-        T               input,              ///< [in] The value to broadcast
-        int             src_lane)           ///< [in] Which warp lane is to do the broadcasting
-    {
-        return ShuffleIndex(input, src_lane, LOGICAL_WARP_THREADS, member_mask);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive operations
-    //---------------------------------------------------------------------
-
-    /// Inclusive scan
-    template <typename _T, typename ScanOpT>
-    __device__ __forceinline__ void InclusiveScan(
-        _T              input,              ///< [in] Calling thread's input item.
-        _T              &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOpT         scan_op)            ///< [in] Binary scan operator
-    {
-        inclusive_output = input;
-
-        // Iterate scan steps
-        int segment_first_lane = 0;
-
-        // Iterate scan steps
-//        InclusiveScanStep(inclusive_output, scan_op, segment_first_lane, Int2Type<0>());
-
-        // Iterate scan steps
-        #pragma unroll
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            inclusive_output = InclusiveScanStep(
-                inclusive_output,
-                scan_op,
-                segment_first_lane,
-                (1 << STEP),
-                Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
-        }
-
-    }
-
-    /// Inclusive scan, specialized for reduce-value-by-key
-    template <typename KeyT, typename ValueT, typename ReductionOpT>
-    __device__ __forceinline__ void InclusiveScan(
-        KeyValuePair<KeyT, ValueT>      input,              ///< [in] Calling thread's input item.
-        KeyValuePair<KeyT, ValueT>      &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ReduceByKeyOp<ReductionOpT >    scan_op)            ///< [in] Binary scan operator
-    {
-        inclusive_output = input;
-
-        KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask);
-
-        unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask);
-
-        // Mask away all lanes greater than ours
-        ballot = ballot & LaneMaskLe();
-
-        // Find index of first set bit
-        int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));
-
-        // Iterate scan steps
-//        InclusiveScanStep(inclusive_output.value, scan_op.op, segment_first_lane, Int2Type<0>());
-
-        // Iterate scan steps
-        #pragma unroll
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            inclusive_output.value = InclusiveScanStep(
-                inclusive_output.value,
-                scan_op.op,
-                segment_first_lane,
-                (1 << STEP),
-                Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
-        }
-    }
-
-
-    /// Inclusive scan with aggregate
-    template <typename ScanOpT>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOpT         scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InclusiveScan(input, inclusive_output, scan_op);
-
-        // Grab aggregate from last warp lane
-        warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Get exclusive from inclusive
-    //---------------------------------------------------------------------
-
-    /// Update inclusive and exclusive using input and inclusive
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update(
-        T                       /*input*/,          ///< [in]
-        T                       &inclusive,         ///< [in, out]
-        T                       &exclusive,         ///< [out]
-        ScanOpT                 /*scan_op*/,        ///< [in]
-        IsIntegerT              /*is_integer*/)     ///< [in]
-    {
-        // initial value unknown
-        exclusive = ShuffleUp(inclusive, 1, 0, member_mask);
-    }
-
-    /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ void Update(
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        cub::Sum                /*scan_op*/,
-        Int2Type<true>          /*is_integer*/)
-    {
-        // initial value presumed 0
-        exclusive = inclusive - input;
-    }
-
-    /// Update inclusive and exclusive using initial value using input, inclusive, and initial value
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       /*input*/,
-        T                       &inclusive,
-        T                       &exclusive,
-        ScanOpT                 scan_op,
-        T                       initial_value,
-        IsIntegerT              /*is_integer*/)
-    {
-        inclusive = scan_op(initial_value, inclusive);
-        exclusive = ShuffleUp(inclusive, 1, 0, member_mask);
-
-        unsigned int segment_id = (IS_ARCH_WARP) ?
-            lane_id :
-            lane_id % LOGICAL_WARP_THREADS;
-
-        if (segment_id == 0)
-            exclusive = initial_value;
-    }
-
-    /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ void Update (
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        cub::Sum                scan_op,
-        T                       initial_value,
-        Int2Type<true>          /*is_integer*/)
-    {
-        inclusive = scan_op(initial_value, inclusive);
-        exclusive = inclusive - input;
-    }
-
-
-    /// Update inclusive, exclusive, and warp aggregate using input and inclusive
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        T                       &warp_aggregate,
-        ScanOpT                 scan_op,
-        IsIntegerT              is_integer)
-    {
-        warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask);
-        Update(input, inclusive, exclusive, scan_op, is_integer);
-    }
-
-    /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        T                       &warp_aggregate,
-        ScanOpT                 scan_op,
-        T                       initial_value,
-        IsIntegerT              is_integer)
-    {
-        warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask);
-        Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
-    }
-
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/specializations/warp_scan_smem.cuh b/external/cub/cub/warp/specializations/warp_scan_smem.cuh
deleted file mode 100644
index c3a7a94ba26..00000000000
--- a/external/cub/cub/warp/specializations/warp_scan_smem.cuh
+++ /dev/null
@@ -1,397 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../thread/thread_load.cuh"
-#include "../../thread/thread_store.cuh"
-#include "../../util_type.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-template <
-    typename    T,                      ///< Data type being scanned
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpScanSmem
-{
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of warp scan steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of threads in half a warp
-        HALF_WARP_THREADS = 1 << (STEPS - 1),
-
-        /// The number of shared memory elements per warp
-        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
-    };
-
-    /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars)
-    typedef typename If<((Equals<T, char>::VALUE || Equals<T, signed char>::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT;
-
-    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
-    typedef CellT _TempStorage[WARP_SMEM_ELEMENTS];
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    _TempStorage    &temp_storage;
-    unsigned int    lane_id;
-    unsigned int    member_mask;
-
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpScanSmem(
-        TempStorage     &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS),
-
-        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
-            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
-            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
-    {}
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Basic inclusive scan iteration (template unrolled, inductive-case specialization)
-    template <
-        bool        HAS_IDENTITY,
-        int         STEP,
-        typename    ScanOp>
-    __device__ __forceinline__ void ScanStep(
-        T                       &partial,
-        ScanOp                  scan_op,
-        Int2Type<STEP>          /*step*/)
-    {
-        const int OFFSET = 1 << STEP;
-
-        // Share partial into buffer
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial);
-
-        WARP_SYNC(member_mask);
-
-        // Update partial if addend is in range
-        if (HAS_IDENTITY || (lane_id >= OFFSET))
-        {
-            T addend = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]);
-            partial = scan_op(addend, partial);
-        }
-        WARP_SYNC(member_mask);
-
-        ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>());
-    }
-
-
-    /// Basic inclusive scan iteration(template unrolled, base-case specialization)
-    template <
-        bool        HAS_IDENTITY,
-        typename    ScanOp>
-    __device__ __forceinline__ void ScanStep(
-        T                       &/*partial*/,
-        ScanOp                  /*scan_op*/,
-        Int2Type<STEPS>         /*step*/)
-    {}
-
-
-    /// Inclusive prefix scan (specialized for summation across primitive types)
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,              ///< [in] Calling thread's input item.
-        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        Sum                     scan_op,            ///< [in] Binary scan operator
-        Int2Type<true>          /*is_primitive*/)   ///< [in] Marker type indicating whether T is primitive type
-    {
-        T identity = 0;
-        ThreadStore<STORE_VOLATILE>(&temp_storage[lane_id], (CellT) identity);
-
-        WARP_SYNC(member_mask);
-
-        // Iterate scan steps
-        output = input;
-        ScanStep<true>(output, scan_op, Int2Type<0>());
-    }
-
-
-    /// Inclusive prefix scan
-    template <typename ScanOp, int IS_PRIMITIVE>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,              ///< [in] Calling thread's input item.
-        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp                  scan_op,            ///< [in] Binary scan operator
-        Int2Type<IS_PRIMITIVE>  /*is_primitive*/)   ///< [in] Marker type indicating whether T is primitive type
-    {
-        // Iterate scan steps
-        output = input;
-        ScanStep<false>(output, scan_op, Int2Type<0>());
-    }
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    //---------------------------------------------------------------------
-    // Broadcast
-    //---------------------------------------------------------------------
-
-    /// Broadcast
-    __device__ __forceinline__ T Broadcast(
-        T               input,              ///< [in] The value to broadcast
-        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
-    {
-        if (lane_id == src_lane)
-        {
-            ThreadStore<STORE_VOLATILE>(temp_storage, (CellT) input);
-        }
-
-        WARP_SYNC(member_mask);
-
-        return (T)ThreadLoad<LOAD_VOLATILE>(temp_storage);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive operations
-    //---------------------------------------------------------------------
-
-    /// Inclusive scan
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InclusiveScan(input, inclusive_output, scan_op, Int2Type<Traits<T>::PRIMITIVE>());
-    }
-
-
-    /// Inclusive scan with aggregate
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InclusiveScan(input, inclusive_output, scan_op);
-
-        // Retrieve aggregate
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output);
-
-        WARP_SYNC(member_mask);
-
-        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
-
-        WARP_SYNC(member_mask);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Get exclusive from inclusive
-    //---------------------------------------------------------------------
-
-    /// Update inclusive and exclusive using input and inclusive
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update(
-        T                       /*input*/,      ///< [in]
-        T                       &inclusive,     ///< [in, out]
-        T                       &exclusive,     ///< [out]
-        ScanOpT                 /*scan_op*/,    ///< [in]
-        IsIntegerT              /*is_integer*/) ///< [in]
-    {
-        // initial value unknown
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
-    }
-
-    /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ void Update(
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        cub::Sum                /*scan_op*/,
-        Int2Type<true>          /*is_integer*/)
-    {
-        // initial value presumed 0
-        exclusive = inclusive - input;
-    }
-
-    /// Update inclusive and exclusive using initial value using input, inclusive, and initial value
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       /*input*/,
-        T                       &inclusive,
-        T                       &exclusive,
-        ScanOpT                 scan_op,
-        T                       initial_value,
-        IsIntegerT              /*is_integer*/)
-    {
-        inclusive = scan_op(initial_value, inclusive);
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
-        if (lane_id == 0)
-            exclusive = initial_value;
-    }
-
-    /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ void Update (
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        cub::Sum                scan_op,
-        T                       initial_value,
-        Int2Type<true>          /*is_integer*/)
-    {
-        inclusive = scan_op(initial_value, inclusive);
-        exclusive = inclusive - input;
-    }
-
-
-    /// Update inclusive, exclusive, and warp aggregate using input and inclusive
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       /*input*/,
-        T                       &inclusive,
-        T                       &exclusive,
-        T                       &warp_aggregate,
-        ScanOpT                 /*scan_op*/,
-        IsIntegerT              /*is_integer*/)
-    {
-        // Initial value presumed to be unknown or identity (either way our padding is correct)
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
-        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
-    }
-
-    /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ void Update (
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        T                       &warp_aggregate,
-        cub::Sum                /*scan_o*/,
-        Int2Type<true>          /*is_integer*/)
-    {
-        // Initial value presumed to be unknown or identity (either way our padding is correct)
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
-        exclusive = inclusive - input;
-    }
-
-    /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       /*input*/,
-        T                       &inclusive,
-        T                       &exclusive,
-        T                       &warp_aggregate,
-        ScanOpT                 scan_op,
-        T                       initial_value,
-        IsIntegerT              /*is_integer*/)
-    {
-        // Broadcast warp aggregate
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
-
-        WARP_SYNC(member_mask);
-
-        // Update inclusive with initial value
-        inclusive = scan_op(initial_value, inclusive);
-
-        // Get exclusive from exclusive
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 2]);
-
-        if (lane_id == 0)
-            exclusive = initial_value;
-    }
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/warp_reduce.cuh b/external/cub/cub/warp/warp_reduce.cuh
deleted file mode 100644
index ef78dd6a009..00000000000
--- a/external/cub/cub/warp/warp_reduce.cuh
+++ /dev/null
@@ -1,612 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "specializations/warp_reduce_shfl.cuh"
-#include "specializations/warp_reduce_smem.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup WarpModule
- * @{
- */
-
-/**
- * \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png)
- *
- * \tparam T                        The reduction input/output element type
- * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
- * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- *   uses a binary combining operator to compute a single aggregate from a list of input elements.
- * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
- * - The number of entrant threads must be a multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Performance Considerations
- * - Uses special instructions when applicable (e.g., warp \p SHFL instructions)
- * - Uses synchronization-free communication between warp lanes when applicable
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *     - Summation (<b><em>vs.</em></b> generic reduction)
- *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Simple Examples
- * \warpcollective{WarpReduce}
- * \par
- * The code snippet below illustrates four concurrent warp sum reductions within a block of
- * 128 threads (one per each of the 32-thread warps).
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpReduce for type int
- *     typedef cub::WarpReduce<int> WarpReduce;
- *
- *     // Allocate WarpReduce shared memory for 4 warps
- *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
- *
- *     // Obtain one input item per thread
- *     int thread_data = ...
- *
- *     // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
- *     int warp_id = threadIdx.x / 32;
- *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
- * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
- * \p 2544, and \p 3568, respectively (and is undefined in other threads).
- *
- * \par
- * The code snippet below illustrates a single warp sum reduction within a block of
- * 128 threads.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpReduce for type int
- *     typedef cub::WarpReduce<int> WarpReduce;
- *
- *     // Allocate WarpReduce shared memory for one warp
- *     __shared__ typename WarpReduce::TempStorage temp_storage;
- *     ...
- *
- *     // Only the first warp performs a reduction
- *     if (threadIdx.x < 32)
- *     {
- *         // Obtain one input item per thread
- *         int thread_data = ...
- *
- *         // Return the warp-wide sum to lane0
- *         int aggregate = WarpReduce(temp_storage).Sum(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the warp of threads is <tt>{0, 1, 2, 3, ..., 31}</tt>.
- * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
- *
- */
-template <
-    typename    T,
-    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
-    int         PTX_ARCH                = CUB_PTX_ARCH>
-class WarpReduce
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
-    };
-
-public:
-
-    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    /// Internal specialization.  Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
-    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
-        WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
-        WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpReduce;
-
-    #endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-private:
-
-    /// Shared memory storage layout type for WarpReduce
-    typedef typename InternalWarpReduce::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-public:
-
-    /// \smemstorage{WarpReduce}
-    struct TempStorage : Uninitialized<_TempStorage> {};   // empty wrapper; the constructor obtains the _TempStorage reference via Alias()
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
-     */
-    __device__ __forceinline__ WarpReduce(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias())     // binds the _TempStorage& member; the argument is the same-named parameter
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Summation reductions
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a warp-wide sum in the calling warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp sum reductions within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for 4 warps
-     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Return the warp-wide sums to each lane0
-     *     int warp_id = threadIdx.x / 32;
-     *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
-     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
-     * \p 2544, and \p 3568, respectively (and is undefined in other threads).
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T                   input)              ///< [in] Calling thread's input
-    {   // Full-warp overload: item count is fixed at LOGICAL_WARP_THREADS and the operator is cub::Sum()
-        return InternalWarpReduce(temp_storage).template Reduce<true, 1>(input, LOGICAL_WARP_THREADS, cub::Sum());   // NOTE(review): meaning of <true, 1> is defined by InternalWarpReduce::Reduce, not shown here
-    }
-
-    /**
-     * \brief Computes a partially-full warp-wide sum in the calling warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction within a single, partially-full
-     * block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item per thread if in range
-     *     int thread_data;
-     *     if (threadIdx.x < valid_items)
-     *         thread_data = d_data[threadIdx.x];
-     *
-     *     // Return the warp-wide sums to each lane0
-     *     int aggregate = WarpReduce(temp_storage).Sum(
-     *         thread_data, valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...}</tt> and \p valid_items
-     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 6 (and is
-     * undefined in other threads).
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T                   input,              ///< [in] Calling thread's input
-        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
-    {
-        // Partially-full overload: forwards valid_items; the first template argument is false here (true in the full-warp overload)
-        return InternalWarpReduce(temp_storage).template Reduce<false, 1>(input, valid_items, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a head-segmented warp sum
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int head_flag = ...
-     *
-     *     // Return each segment's sum to the first lane of that segment
-     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
-     *         thread_data, head_flag);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31}</tt> and <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0}</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
-     *
-     * \tparam FlagT           <b>[inferred]</b> Type of \p head_flag (passed through unchanged to HeadSegmentedReduce)
-     *
-     */
-    template <
-        typename            FlagT>
-    __device__ __forceinline__ T HeadSegmentedSum(
-        T                   input,              ///< [in] Calling thread's input
-        FlagT                head_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
-    {   // Thin wrapper: HeadSegmentedReduce with cub::Sum() as the reduction operator
-        return HeadSegmentedReduce(input, head_flag, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a tail-segmented warp sum
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int tail_flag = ...
-     *
-     *     // Return each segment's sum to the first lane of that segment
-     *     int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
-     *         thread_data, tail_flag);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31}</tt> and <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1}</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
-     *
-     * \tparam FlagT           <b>[inferred]</b> Type of \p tail_flag (passed through unchanged to TailSegmentedReduce)
-     */
-    template <
-        typename            FlagT>
-    __device__ __forceinline__ T TailSegmentedSum(
-        T                   input,              ///< [in] Calling thread's input
-        FlagT                tail_flag)          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
-    {   // Thin wrapper: TailSegmentedReduce with cub::Sum() as the reduction operator
-        return TailSegmentedReduce(input, tail_flag, cub::Sum());
-    }
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Generic reductions
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp max reductions within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for 4 warps
-     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Return the warp-wide reductions to each lane0
-     *     int warp_id = threadIdx.x / 32;
-     *     int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(
-     *         thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
-     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 31, \p 63,
-     * \p 95, and \p 127, respectively  (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,              ///< [in] Calling thread's input
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {   // Full-warp overload: item count is fixed at LOGICAL_WARP_THREADS; reduction_op is forwarded unchanged
-        return InternalWarpReduce(temp_storage).template Reduce<true, 1>(input, LOGICAL_WARP_THREADS, reduction_op);
-    }
-
-    /**
-     * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction within a single, partially-full
-     * block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item per thread if in range
-     *     int thread_data;
-     *     if (threadIdx.x < valid_items)
-     *         thread_data = d_data[threadIdx.x];
-     *
-     *     // Return the warp-wide reductions to each lane0
-     *     int aggregate = WarpReduce(temp_storage).Reduce(
-     *         thread_data, cub::Max(), valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...}</tt> and \p valid_items
-     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 3 (and is
-     * undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,              ///< [in] Calling thread's input
-        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
-        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
-    {   // Note the argument order: the internal Reduce takes (input, valid_items, reduction_op)
-        return InternalWarpReduce(temp_storage).template Reduce<false, 1>(input, valid_items, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a head-segmented warp max
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int head_flag = ...
-     *
-     *     // Return each segment's reduction to the first lane of that segment
-     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
-     *         thread_data, head_flag, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31}</tt> and <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0}</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename            ReductionOp,
-        typename            FlagT>
-    __device__ __forceinline__ T HeadSegmentedReduce(
-        T                   input,              ///< [in] Calling thread's input
-        FlagT                head_flag,          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
-        ReductionOp         reduction_op)       ///< [in] Reduction operator
-    {   // Passes <true> to SegmentedReduce (TailSegmentedReduce passes <false>); presumably selects head-flag handling -- confirm in InternalWarpReduce
-        return InternalWarpReduce(temp_storage).template SegmentedReduce<true>(input, head_flag, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a tail-segmented warp max
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int tail_flag = ...
-     *
-     *     // Return each segment's reduction to the first lane of that segment
-     *     int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
-     *         thread_data, tail_flag, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31}</tt> and <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1}</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename            ReductionOp,
-        typename            FlagT>
-    __device__ __forceinline__ T TailSegmentedReduce(
-        T                   input,              ///< [in] Calling thread's input
-        FlagT                tail_flag,          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
-        ReductionOp         reduction_op)       ///< [in] Reduction operator
-    {   // Passes <false> to SegmentedReduce (HeadSegmentedReduce passes <true>); presumably selects tail-flag handling -- confirm in InternalWarpReduce
-        return InternalWarpReduce(temp_storage).template SegmentedReduce<false>(input, tail_flag, reduction_op);
-    }
-
-
-
-    //@}  end member group
-};
-
-/** @} */       // end group WarpModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/cub/warp/warp_scan.cuh b/external/cub/cub/warp/warp_scan.cuh
deleted file mode 100644
index 3f78ca8a090..00000000000
--- a/external/cub/cub/warp/warp_scan.cuh
+++ /dev/null
@@ -1,936 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "specializations/warp_scan_shfl.cuh"
-#include "specializations/warp_scan_smem.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup WarpModule
- * @{
- */
-
-/**
- * \brief The WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.  ![](warp_scan_logo.png)
- *
- * \tparam T                        The scan input/output element type
- * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20).
- * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
- *   produces an output list where each element is computed to be the reduction
- *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
- *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
- *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
- *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
- *   the <em>i</em><sup>th</sup> output reduction.
- * - Supports non-commutative scan operators
- * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads)
- * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Performance Considerations
- * - Uses special instructions when applicable (e.g., warp \p SHFL)
- * - Uses synchronization-free communication between warp lanes when applicable
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *     - Summation (<b><em>vs.</em></b> generic scan)
- *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Simple Examples
- * \warpcollective{WarpScan}
- * \par
- * The code snippet below illustrates four concurrent warp prefix sums within a block of
- * 128 threads (one per each of the 32-thread warps).
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpScan for type int
- *     typedef cub::WarpScan<int> WarpScan;
- *
- *     // Allocate WarpScan shared memory for 4 warps
- *     __shared__ typename WarpScan::TempStorage temp_storage[4];
- *
- *     // Obtain one input item per thread
- *     int thread_data = ...
- *
- *     // Compute warp-wide prefix sums
- *     int warp_id = threadIdx.x / 32;
- *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
- * The corresponding output \p thread_data in each of the four warps of threads will be
- * <tt>0, 1, 2, 3, ..., 31}</tt>.
- *
- * \par
- * The code snippet below illustrates a single warp prefix sum within a block of
- * 128 threads.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpScan for type int
- *     typedef cub::WarpScan<int> WarpScan;
- *
- *     // Allocate WarpScan shared memory for one warp
- *     __shared__ typename WarpScan::TempStorage temp_storage;
- *     ...
- *
- *     // Only the first warp performs a prefix sum
- *     if (threadIdx.x < 32)
- *     {
- *         // Obtain one input item per thread
- *         int thread_data = ...
- *
- *         // Compute warp-wide prefix sums
- *         WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the warp of threads is <tt>{1, 1, 1, 1, ...}</tt>.
- * The corresponding output \p thread_data will be <tt>{0, 1, 2, 3, ..., 31}</tt>.
- *
- */
-template <
-    typename    T,
-    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
-    int         PTX_ARCH                = CUB_PTX_ARCH>
-class WarpScan
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
-
-        /// Whether the data type is an integer (which has fully-associative addition)
-        IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
-    };
-
-    /// Internal specialization.  Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
-    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
-        WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
-        WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan;
-
-    /// Shared memory storage layout type for WarpScan
-    typedef typename InternalWarpScan::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage    &temp_storage;
-    unsigned int    lane_id;
-
-
-
-    /******************************************************************************
-     * Public types
-     ******************************************************************************/
-
-public:
-
-    /// \smemstorage{WarpScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
-     */
-    __device__ __forceinline__ WarpScan(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS)
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix sums
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive prefix sum across the calling warp.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix sums
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>1, 2, 3, ..., 32}</tt>.
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output)  ///< [out] Calling thread's output item.  May be aliased with \p input.
-    {
-        InclusiveScan(input, inclusive_output, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an inclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix sums
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>1, 2, 3, ..., 32}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix sums
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.
-     *
-     * \par
-     *  - \identityzero
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix sums
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>0, 1, 2, ..., 31}</tt>.
-     *
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output)  ///< [out] Calling thread's output item.  May be aliased with \p input.
-    {
-        T initial_value = 0;
-        ExclusiveScan(input, exclusive_output, initial_value, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     *  - \identityzero
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix sums
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>0, 1, 2, ..., 31}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        T initial_value = 0;
-        ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
-     *
-     * \par
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix max scans
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveScan(
-     *         thread_data, thread_data, cub::Max(), warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
-     * in the second warp, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan internal(temp_storage);
-
-        T inclusive_output;
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            scan_op,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan internal(temp_storage);
-
-        T inclusive_output;
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            scan_op,
-            initial_value,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)  Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
-     * in the second warp, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output,   ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan internal(temp_storage);
-
-        T inclusive_output;
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            warp_aggregate,
-            scan_op,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
-     * in the second warp, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan internal(temp_storage);
-
-        T inclusive_output;
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            warp_aggregate,
-            scan_op,
-            initial_value,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Combination (inclusive & exclusive) prefix scans
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p exclusive_output computed for <em>warp-lane</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int inclusive_partial, exclusive_partial;
-     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p inclusive_partial in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     * The corresponding output \p exclusive_partial in the first warp would be
-     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void Scan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
-        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan internal(temp_storage);
-
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            scan_op,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-    /**
-     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.
-     *
-     * \par
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     int inclusive_partial, exclusive_partial;
-     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p inclusive_partial in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     * The corresponding output \p exclusive_partial in the first warp would be
-     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void Scan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
-        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
-        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan internal(temp_storage);
-
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            scan_op,
-            initial_value,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Data exchange
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Broadcast the value \p input from <em>warp-lane</em><sub><tt>src_lane</tt></sub> to all lanes in the warp
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the warp-wide broadcasts of values from
-     * lanes<sub>0</sub> in each of four warps to all other threads in those warps.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Broadcast from lane0 in each warp to all other threads in the warp
-     *     int warp_id = threadIdx.x / 32;
-     *     thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
-     * The corresponding output \p thread_data will be
-     * <tt>{0, 0, ..., 0}</tt> in warp<sub>0</sub>,
-     * <tt>{32, 32, ..., 32}</tt> in warp<sub>1</sub>,
-     * <tt>{64, 64, ..., 64}</tt> in warp<sub>2</sub>, etc.
-     */
-    __device__ __forceinline__ T Broadcast(
-        T               input,              ///< [in] The value to broadcast
-        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
-    {
-        return InternalWarpScan(temp_storage).Broadcast(input, src_lane);
-    }
-
-    //@}  end member group
-
-};
-
-/** @} */       // end group WarpModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/external/cub/eclipse code style profile.xml b/external/cub/eclipse code style profile.xml
deleted file mode 100644
index 3ca7f771cc2..00000000000
--- a/external/cub/eclipse code style profile.xml	
+++ /dev/null
@@ -1,155 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<profiles version="1">
-<profile kind="CodeFormatterProfile" name="B40C" version="1">
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_declaration" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_for" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_new_line_in_empty_block" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.lineSplit" value="80"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_base_types" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.keep_else_statement_on_same_line" value="false"/>
-<setting id="org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_switch" value="false"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_brace_in_array_initializer" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_if" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_exception_specification" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_base_types" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_access_specifier" value="true"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_exception_specification" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_arguments" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_block" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_declaration" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.use_tabs_only_for_leading_indentations" value="false"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_labeled_statement" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_case" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_array_initializer" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_enum_declarations" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.alignment_for_expressions_in_array_initializer" value="16"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_declarator_list" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_bracket" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_for" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_prefix_operator" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.tabulation.size" value="4"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_else_in_if_statement" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.alignment_for_enumerator_list" value="48"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_declaration" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.alignment_for_declarator_list" value="16"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_switch" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.indent_empty_lines" value="false"/>
-<setting id="org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_cases" value="true"/>
-<setting id="org.eclipse.cdt.core.formatter.keep_empty_array_initializer_on_one_line" value="false"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_method_declaration" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.put_empty_statement_on_new_line" value="true"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_switch" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_cast" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_braces_in_array_initializer" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.brace_position_for_method_declaration" value="next_line"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_while" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_question_in_conditional" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_semicolon" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_arguments" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_base_clause" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.indent_breaks_compare_to_cases" value="true"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_unary_operator" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_declarator_list" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.alignment_for_arguments_in_method_invocation" value="16"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_while" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_brackets" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_bracket" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.alignment_for_parameters_in_method_declaration" value="48"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.number_of_empty_lines_to_preserve" value="1"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_invocation" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_brace_in_array_initializer" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_semicolon_in_for" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_conditional" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.brace_position_for_block" value="next_line"/>
-<setting id="org.eclipse.cdt.core.formatter.brace_position_for_type_declaration" value="next_line"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_assignment_operator" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_arguments" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_expression_list" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_parameters" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.continuation_indentation" value="1"/>
-<setting id="org.eclipse.cdt.core.formatter.alignment_for_expression_list" value="0"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_declaration" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_parameters" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_default" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_binary_operator" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.alignment_for_conditional_expression" value="48"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_invocation" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_array_initializer" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_if" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.format_guardian_clause_on_one_line" value="false"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_cast" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.indent_access_specifier_compare_to_type_header" value="false"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_type_declaration" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.continuation_indentation_for_array_initializer" value="1"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_labeled_statement" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_semicolon_in_for" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_invocation" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_namespace_header" value="false"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_brace_in_block" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_assignment_operator" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.alignment_for_compact_if" value="0"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_array_initializer" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_new_line_at_end_of_file_if_missing" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_parameters" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_expression_list" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_question_in_conditional" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_exception_specification" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_binary_operator" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_identifier_in_function_declaration" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.alignment_for_base_clause_in_type_declaration" value="48"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_throws" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_exception_specification" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.indent_declaration_compare_to_template_header" value="false"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_unary_operator" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_switch" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.indent_statements_compare_to_body" value="true"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_throws" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.indent_statements_compare_to_block" value="true"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_arguments" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_catch_in_try_statement" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.alignment_for_throws_clause_in_method_declaration" value="48"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_invocation" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_paren_in_cast" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_catch" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_parameters" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.tabulation.char" value="space"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_parameters" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_while" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.brace_position_for_block_in_case" value="end_of_line"/>
-<setting id="org.eclipse.cdt.core.formatter.compact_else_if" value="true"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_postfix_operator" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_base_clause" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_new_line_after_template_declaration" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_catch" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.keep_then_statement_on_same_line" value="false"/>
-<setting id="org.eclipse.cdt.core.formatter.brace_position_for_switch" value="end_of_line"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_if" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_switch" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.keep_imple_if_on_one_line" value="true"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.indentation.size" value="4"/>
-<setting id="org.eclipse.cdt.core.formatter.brace_position_for_namespace_declaration" value="end_of_line"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_colon_in_conditional" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_comma_in_enum_declarations" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_prefix_operator" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_arguments" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.brace_position_for_array_initializer" value="next_line"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_colon_in_case" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_catch" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_namespace_declaration" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_postfix_operator" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_closing_bracket" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_new_line_before_while_in_do_statement" value="do not insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_for" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_parameters" value="insert"/>
-<setting id="org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_arguments" value="do not insert"/>
-</profile>
-</profiles>
diff --git a/external/cub/examples/block/Makefile b/external/cub/examples/block/Makefile
deleted file mode 100644
index 753931b3407..00000000000
--- a/external/cub/examples/block/Makefile
+++ /dev/null
@@ -1,128 +0,0 @@
-#/******************************************************************************
-# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
-# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
-# * 
-# * Redistribution and use in source and binary forms, with or without
-# * modification, are permitted provided that the following conditions are met:
-# *	 * Redistributions of source code must retain the above copyright
-# *	   notice, this list of conditions and the following disclaimer.
-# *	 * Redistributions in binary form must reproduce the above copyright
-# *	   notice, this list of conditions and the following disclaimer in the
-# *	   documentation and/or other materials provided with the distribution.
-# *	 * Neither the name of the NVIDIA CORPORATION nor the
-# *	   names of its contributors may be used to endorse or promote products
-# *	   derived from this software without specific prior written permission.
-# * 
-# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-# *
-#******************************************************************************/
-
-#-------------------------------------------------------------------------------
-#
-# Makefile usage
-#
-# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>]
-#
-#-------------------------------------------------------------------------------
- 
-include ../../common.mk 
- 
- 
-#-------------------------------------------------------------------------------
-# Includes
-#-------------------------------------------------------------------------------
-
-INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
-
-
-
-#-------------------------------------------------------------------------------
-# Dependency Lists
-#-------------------------------------------------------------------------------
-
-rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
-
-DEPS =				$(CUB_DEPS) \
-					$(CUB_DIR)test/Makefile \
-					$(CUB_DIR)test/test_util.h \
-					$(CUB_DIR)test/mersenne.h \
-		
-ALL = 	example_block_radix_sort \
-	 	example_block_reduce \
-	 	example_block_scan
-		
-
-
-#-------------------------------------------------------------------------------
-# make default
-#-------------------------------------------------------------------------------
-
-default:
-
-
-#-------------------------------------------------------------------------------
-# make clean
-#-------------------------------------------------------------------------------
-
-clean :
-	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
-	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
-
-
-#-------------------------------------------------------------------------------
-# make all
-#-------------------------------------------------------------------------------
-
-all : $(ALL)
-
-#-------------------------------------------------------------------------------
-# make run
-#-------------------------------------------------------------------------------
-
-run : 
-	for i in $(ALL); do ./bin/$${i}_$(BIN_SUFFIX) --device=$(device) || exit 1; done
-
-
-
-
-#-------------------------------------------------------------------------------
-# make example_block_reduce
-#-------------------------------------------------------------------------------
-
-example_block_reduce: bin/example_block_reduce_$(BIN_SUFFIX)
-
-bin/example_block_reduce_$(BIN_SUFFIX) : example_block_reduce.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_reduce_$(BIN_SUFFIX) example_block_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make example_block_scan
-#-------------------------------------------------------------------------------
-
-example_block_scan: bin/example_block_scan_$(BIN_SUFFIX)
-
-bin/example_block_scan_$(BIN_SUFFIX) : example_block_scan.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_scan_$(BIN_SUFFIX) example_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make example_block_radix_sort
-#-------------------------------------------------------------------------------
-
-example_block_radix_sort: bin/example_block_radix_sort_$(BIN_SUFFIX)
-
-bin/example_block_radix_sort_$(BIN_SUFFIX) : example_block_radix_sort.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_radix_sort_$(BIN_SUFFIX) example_block_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-	
diff --git a/external/cub/examples/block/example_block_radix_sort.cu b/external/cub/examples/block/example_block_radix_sort.cu
deleted file mode 100644
index 0bceb831ccf..00000000000
--- a/external/cub/examples/block/example_block_radix_sort.cu
+++ /dev/null
@@ -1,323 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple demonstration of cub::BlockRadixSort
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_block_radix_sort.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console (define before including cub.h)
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <iostream>
-#include <algorithm>
-
-#include <cub/block/block_load.cuh>
-#include <cub/block/block_store.cuh>
-#include <cub/block/block_radix_sort.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-/// Verbose output
-bool g_verbose = false;
-
-/// Timing iterations
-int g_timing_iterations = 100;
-
-/// Default grid size
-int g_grid_size = 1;
-
-/// Uniform key samples
-bool g_uniform_keys;
-
-
-//---------------------------------------------------------------------
-// Kernels
-//---------------------------------------------------------------------
-
-/**
- * Simple kernel for performing a block-wide sorting over integers
- */
-template <
-    typename    Key,
-    int         BLOCK_THREADS,
-    int         ITEMS_PER_THREAD>
-__launch_bounds__ (BLOCK_THREADS)
-__global__ void BlockSortKernel(
-    Key         *d_in,          // Tile of input
-    Key         *d_out,         // Tile of output
-    clock_t     *d_elapsed)     // Elapsed cycle count of block scan
-{
-    enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
-
-    // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
-    typedef BlockLoad<Key, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
-
-    // Specialize BlockRadixSort type for our thread block
-    typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
-
-    // Shared memory
-    __shared__ union TempStorage
-    {
-        typename BlockLoadT::TempStorage        load;
-        typename BlockRadixSortT::TempStorage   sort;
-    } temp_storage;
-
-    // Per-thread tile items
-    Key items[ITEMS_PER_THREAD];
-
-    // Our current block's offset
-    int block_offset = blockIdx.x * TILE_SIZE;
-
-    // Load items into a blocked arrangement
-    BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
-
-    // Barrier for smem reuse
-    __syncthreads();
-
-    // Start cycle timer
-    clock_t start = clock();
-
-    // Sort keys
-    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(items);
-
-    // Stop cycle timer
-    clock_t stop = clock();
-
-    // Store output in striped fashion
-    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
-
-    // Store elapsed clocks
-    if (threadIdx.x == 0)
-    {
-        d_elapsed[blockIdx.x] = (start > stop) ? start - stop : stop - start;
-    }
-}
-
-
-
-//---------------------------------------------------------------------
-// Host utilities
-//---------------------------------------------------------------------
-
-
-/**
- * Initialize sorting problem (and solution).
- */
-template <typename Key>
-void Initialize(
-    Key *h_in,
-    Key *h_reference,
-    int num_items,
-    int tile_size)
-{
-    for (int i = 0; i < num_items; ++i)
-    {
-        if (g_uniform_keys)
-        {
-            h_in[i] = 0;
-        }
-        else
-        {
-            RandomBits(h_in[i]);
-        }
-        h_reference[i] = h_in[i];
-    }
-
-    // Only sort the first tile
-    std::sort(h_reference, h_reference + tile_size);
-}
-
-
-/**
- * Test BlockScan
- */
-template <
-    typename    Key,
-    int         BLOCK_THREADS,
-    int         ITEMS_PER_THREAD>
-void Test()
-{
-    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
-
-    // Allocate host arrays
-    Key *h_in               = new Key[TILE_SIZE * g_grid_size];
-    Key *h_reference        = new Key[TILE_SIZE * g_grid_size];
-    clock_t *h_elapsed      = new clock_t[g_grid_size];
-
-    // Initialize problem and reference output on host
-    Initialize(h_in, h_reference, TILE_SIZE * g_grid_size, TILE_SIZE);
-
-    // Initialize device arrays
-    Key *d_in       = NULL;
-    Key *d_out      = NULL;
-    clock_t *d_elapsed  = NULL;
-    CubDebugExit(cudaMalloc((void**)&d_in,          sizeof(Key) * TILE_SIZE * g_grid_size));
-    CubDebugExit(cudaMalloc((void**)&d_out,         sizeof(Key) * TILE_SIZE * g_grid_size));
-    CubDebugExit(cudaMalloc((void**)&d_elapsed,     sizeof(clock_t) * g_grid_size));
-
-    // Display input problem data
-    if (g_verbose)
-    {
-        printf("Input data: ");
-        for (int i = 0; i < TILE_SIZE; i++)
-            std::cout << h_in[i] << ", ";
-        printf("\n\n");
-    }
-
-    // Kernel props
-    int max_sm_occupancy;
-    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD>, BLOCK_THREADS));
-
-    // Copy problem to device
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(Key) * TILE_SIZE * g_grid_size, cudaMemcpyHostToDevice));
-
-    printf("BlockRadixSort %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
-        TILE_SIZE * g_grid_size, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
-    fflush(stdout);
-
-    // Run kernel once to prime caches and check result
-    BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
-        d_in,
-        d_out,
-        d_elapsed);
-
-    // Check for kernel errors and STDIO from the kernel, if any
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Check results
-    printf("\tOutput items: ");
-    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-    fflush(stdout);
-
-    // Run this several times and average the performance results
-    GpuTimer            timer;
-    float               elapsed_millis          = 0.0;
-    unsigned long long  elapsed_clocks          = 0;
-
-    for (int i = 0; i < g_timing_iterations; ++i)
-    {
-        timer.Start();
-
-        // Run kernel
-        BlockSortKernel<Key, BLOCK_THREADS, ITEMS_PER_THREAD><<<g_grid_size, BLOCK_THREADS>>>(
-            d_in,
-            d_out,
-            d_elapsed);
-
-        timer.Stop();
-        elapsed_millis += timer.ElapsedMillis();
-
-        // Copy clocks from device
-        CubDebugExit(cudaMemcpy(h_elapsed, d_elapsed, sizeof(clock_t) * g_grid_size, cudaMemcpyDeviceToHost));
-        for (int i = 0; i < g_grid_size; i++)
-            elapsed_clocks += h_elapsed[i];
-    }
-
-    // Check for kernel errors and STDIO from the kernel, if any
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Display timing results
-    float avg_millis            = elapsed_millis / g_timing_iterations;
-    float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
-    double avg_clocks           = double(elapsed_clocks) / g_timing_iterations / g_grid_size;
-    double avg_clocks_per_item  = avg_clocks / TILE_SIZE;
-
-    printf("\tAverage BlockRadixSort::SortBlocked clocks: %.3f\n", avg_clocks);
-    printf("\tAverage BlockRadixSort::SortBlocked clocks per item: %.3f\n", avg_clocks_per_item);
-    printf("\tAverage kernel millis: %.4f\n", avg_millis);
-    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
-    fflush(stdout);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (h_elapsed) delete[] h_elapsed;
-    if (d_in) CubDebugExit(cudaFree(d_in));
-    if (d_out) CubDebugExit(cudaFree(d_out));
-    if (d_elapsed) CubDebugExit(cudaFree(d_elapsed));
-}
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    g_uniform_keys = args.CheckCmdLineFlag("uniform");
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    args.GetCmdLineArgument("grid-size", g_grid_size);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--i=<timing iterations (default:%d)>]"
-            "[--grid-size=<grid size (default:%d)>]"
-            "[--v] "
-            "\n", argv[0], g_timing_iterations, g_grid_size);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-    fflush(stdout);
-
-    // Run tests
-    printf("\nuint32:\n"); fflush(stdout);
-    Test<unsigned int, 128, 13>();
-    printf("\n"); fflush(stdout);
-
-    printf("\nfp32:\n"); fflush(stdout);
-    Test<float, 128, 13>();
-    printf("\n"); fflush(stdout);
-
-    printf("\nuint8:\n"); fflush(stdout);
-    Test<unsigned char, 128, 13>();
-    printf("\n"); fflush(stdout);
-
-    return 0;
-}
-
diff --git a/external/cub/examples/block/example_block_reduce.cu b/external/cub/examples/block/example_block_reduce.cu
deleted file mode 100644
index 8e30ef23296..00000000000
--- a/external/cub/examples/block/example_block_reduce.cu
+++ /dev/null
@@ -1,290 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple demonstration of cub::BlockReduce
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_block_reduce.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console (define before including cub.h)
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <iostream>
-
-#include <cub/block/block_load.cuh>
-#include <cub/block/block_store.cuh>
-#include <cub/block/block_reduce.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-/// Verbose output
-bool g_verbose = false;
-
-/// Timing iterations
-int g_timing_iterations = 100;
-
-/// Default grid size
-int g_grid_size = 1;
-
-
-
-//---------------------------------------------------------------------
-// Kernels
-//---------------------------------------------------------------------
-
-/**
- * Simple kernel for performing a block-wide exclusive prefix sum over integers
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    BlockReduceAlgorithm    ALGORITHM>
-__global__ void BlockSumKernel(
-    int         *d_in,          // Tile of input
-    int         *d_out,         // Tile aggregate
-    clock_t     *d_elapsed)     // Elapsed cycle count of block reduction
-{
-    // Specialize BlockReduce type for our thread block
-    typedef BlockReduce<int, BLOCK_THREADS, ALGORITHM> BlockReduceT;
-
-    // Shared memory
-    __shared__ typename BlockReduceT::TempStorage temp_storage;
-
-    // Per-thread tile data
-    int data[ITEMS_PER_THREAD];
-    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in, data);
-
-    // Start cycle timer
-    clock_t start = clock();
-
-    // Compute sum
-    int aggregate = BlockReduceT(temp_storage).Sum(data);
-
-    // Stop cycle timer
-    clock_t stop = clock();
-
-    // Store aggregate and elapsed clocks
-    if (threadIdx.x == 0)
-    {
-        *d_elapsed = (start > stop) ? start - stop : stop - start;
-        *d_out = aggregate;
-    }
-}
-
-
-
-//---------------------------------------------------------------------
-// Host utilities
-//---------------------------------------------------------------------
-
-/**
- * Initialize reduction problem (and solution).
- * Returns the aggregate
- */
-int Initialize(int *h_in, int num_items)
-{
-    int inclusive = 0;
-
-    for (int i = 0; i < num_items; ++i)
-    {
-        h_in[i] = i % 17;
-        inclusive += h_in[i];
-    }
-
-    return inclusive;
-}
-
-
-/**
- * Test thread block reduction
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    BlockReduceAlgorithm    ALGORITHM>
-void Test()
-{
-    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
-
-    // Allocate host arrays
-    int *h_in           = new int[TILE_SIZE];
-    int *h_gpu          = new int[TILE_SIZE + 1];
-
-    // Initialize problem and reference output on host
-    int h_aggregate = Initialize(h_in, TILE_SIZE);
-
-    // Initialize device arrays
-    int *d_in           = NULL;
-    int *d_out          = NULL;
-    clock_t *d_elapsed  = NULL;
-    cudaMalloc((void**)&d_in,          sizeof(int) * TILE_SIZE);
-    cudaMalloc((void**)&d_out,         sizeof(int) * 1);
-    cudaMalloc((void**)&d_elapsed,     sizeof(clock_t));
-
-    // Display input problem data
-    if (g_verbose)
-    {
-        printf("Input data: ");
-        for (int i = 0; i < TILE_SIZE; i++)
-            printf("%d, ", h_in[i]);
-        printf("\n\n");
-    }
-
-    // Kernel props
-    int max_sm_occupancy;
-    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
-
-    // Copy problem to device
-    cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
-
-    printf("BlockReduce algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
-        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : "BLOCK_REDUCE_WARP_REDUCTIONS",
-        TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
-
-    // Run aggregate/prefix kernel
-    BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
-        d_in,
-        d_out,
-        d_elapsed);
-
-    // Check total aggregate
-    printf("\tAggregate: ");
-    int compare = CompareDeviceResults(&h_aggregate, d_out, 1, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Run this several times and average the performance results
-    GpuTimer    timer;
-    float       elapsed_millis          = 0.0;
-    clock_t     elapsed_clocks          = 0;
-
-    for (int i = 0; i < g_timing_iterations; ++i)
-    {
-        // Copy problem to device
-        cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
-
-        timer.Start();
-
-        // Run aggregate/prefix kernel
-        BlockSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
-            d_in,
-            d_out,
-            d_elapsed);
-
-        timer.Stop();
-        elapsed_millis += timer.ElapsedMillis();
-
-        // Copy clocks from device
-        clock_t clocks;
-        CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
-        elapsed_clocks += clocks;
-
-    }
-
-    // Check for kernel errors and STDIO from the kernel, if any
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Display timing results
-    float avg_millis            = elapsed_millis / g_timing_iterations;
-    float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
-    float avg_clocks            = float(elapsed_clocks) / g_timing_iterations;
-    float avg_clocks_per_item   = avg_clocks / TILE_SIZE;
-
-    printf("\tAverage BlockReduce::Sum clocks: %.3f\n", avg_clocks);
-    printf("\tAverage BlockReduce::Sum clocks per item: %.3f\n", avg_clocks_per_item);
-    printf("\tAverage kernel millis: %.4f\n", avg_millis);
-    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_gpu) delete[] h_gpu;
-    if (d_in) cudaFree(d_in);
-    if (d_out) cudaFree(d_out);
-    if (d_elapsed) cudaFree(d_elapsed);
-}
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    args.GetCmdLineArgument("grid-size", g_grid_size);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--i=<timing iterations>] "
-            "[--grid-size=<grid size>] "
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Run tests
-    Test<1024, 1, BLOCK_REDUCE_RAKING>();
-    Test<512, 2, BLOCK_REDUCE_RAKING>();
-    Test<256, 4, BLOCK_REDUCE_RAKING>();
-    Test<128, 8, BLOCK_REDUCE_RAKING>();
-    Test<64, 16, BLOCK_REDUCE_RAKING>();
-    Test<32, 32, BLOCK_REDUCE_RAKING>();
-    Test<16, 64, BLOCK_REDUCE_RAKING>();
-
-    printf("-------------\n");
-
-    Test<1024, 1, BLOCK_REDUCE_WARP_REDUCTIONS>();
-    Test<512, 2, BLOCK_REDUCE_WARP_REDUCTIONS>();
-    Test<256, 4, BLOCK_REDUCE_WARP_REDUCTIONS>();
-    Test<128, 8, BLOCK_REDUCE_WARP_REDUCTIONS>();
-    Test<64, 16, BLOCK_REDUCE_WARP_REDUCTIONS>();
-    Test<32, 32, BLOCK_REDUCE_WARP_REDUCTIONS>();
-    Test<16, 64, BLOCK_REDUCE_WARP_REDUCTIONS>();
-
-    return 0;
-}
-
diff --git a/external/cub/examples/block/example_block_scan.cu b/external/cub/examples/block/example_block_scan.cu
deleted file mode 100644
index 74729f8e95b..00000000000
--- a/external/cub/examples/block/example_block_scan.cu
+++ /dev/null
@@ -1,334 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple demonstration of cub::BlockScan
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_block_scan.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console (define before including cub.h)
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <iostream>
-
-#include <cub/block/block_load.cuh>
-#include <cub/block/block_store.cuh>
-#include <cub/block/block_scan.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-/// Verbose output
-bool g_verbose = false;
-
-/// Timing iterations
-int g_timing_iterations = 100;
-
-/// Default grid size
-int g_grid_size = 1;
-
-
-
-//---------------------------------------------------------------------
-// Kernels
-//---------------------------------------------------------------------
-
-/**
- * Simple kernel for performing a block-wide exclusive prefix sum over integers
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    BlockScanAlgorithm      ALGORITHM>
-__global__ void BlockPrefixSumKernel(
-    int         *d_in,          // Tile of input
-    int         *d_out,         // Tile of output
-    clock_t     *d_elapsed)     // Elapsed cycle count of block scan
-{
-    // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
-    typedef BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
-
-    // Specialize BlockStore type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement)
-    typedef BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT;
-
-    // Specialize BlockScan type for our thread block
-    typedef BlockScan<int, BLOCK_THREADS, ALGORITHM> BlockScanT;
-
-    // Shared memory
-    __shared__ union TempStorage
-    {
-        typename BlockLoadT::TempStorage    load;
-        typename BlockStoreT::TempStorage   store;
-        typename BlockScanT::TempStorage    scan;
-    } temp_storage;
-
-    // Per-thread tile data
-    int data[ITEMS_PER_THREAD];
-
-    // Load items into a blocked arrangement
-    BlockLoadT(temp_storage.load).Load(d_in, data);
-
-    // Barrier for smem reuse
-    __syncthreads();
-
-    // Start cycle timer
-    clock_t start = clock();
-
-    // Compute exclusive prefix sum
-    int aggregate;
-    BlockScanT(temp_storage.scan).ExclusiveSum(data, data, aggregate);
-
-    // Stop cycle timer
-    clock_t stop = clock();
-
-    // Barrier for smem reuse
-    __syncthreads();
-
-    // Store items from a blocked arrangement
-    BlockStoreT(temp_storage.store).Store(d_out, data);
-
-    // Store aggregate and elapsed clocks
-    if (threadIdx.x == 0)
-    {
-        *d_elapsed = (start > stop) ? start - stop : stop - start;
-        d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate;
-    }
-}
-
-
-
-//---------------------------------------------------------------------
-// Host utilities
-//---------------------------------------------------------------------
-
-/**
- * Initialize exclusive prefix sum problem (and solution).
- * Returns the aggregate
- */
-int Initialize(
-    int *h_in,
-    int *h_reference,
-    int num_items)
-{
-    int inclusive = 0;
-
-    for (int i = 0; i < num_items; ++i)
-    {
-        h_in[i] = i % 17;
-
-        h_reference[i] = inclusive;
-        inclusive += h_in[i];
-    }
-
-    return inclusive;
-}
-
-
-/**
- * Test thread block scan
- */
-template <
-    int                 BLOCK_THREADS,
-    int                 ITEMS_PER_THREAD,
-    BlockScanAlgorithm  ALGORITHM>
-void Test()
-{
-    const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD;
-
-    // Allocate host arrays
-    int *h_in           = new int[TILE_SIZE];
-    int *h_reference    = new int[TILE_SIZE];
-    int *h_gpu          = new int[TILE_SIZE + 1];
-
-    // Initialize problem and reference output on host
-    int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE);
-
-    // Initialize device arrays
-    int *d_in           = NULL;
-    int *d_out          = NULL;
-    clock_t *d_elapsed  = NULL;
-    cudaMalloc((void**)&d_in,          sizeof(int) * TILE_SIZE);
-    cudaMalloc((void**)&d_out,         sizeof(int) * (TILE_SIZE + 1));
-    cudaMalloc((void**)&d_elapsed,     sizeof(clock_t));
-
-    // Display input problem data
-    if (g_verbose)
-    {
-        printf("Input data: ");
-        for (int i = 0; i < TILE_SIZE; i++)
-            printf("%d, ", h_in[i]);
-        printf("\n\n");
-    }
-
-    // Kernel props
-    int max_sm_occupancy;
-    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS));
-
-    // Copy problem to device
-    cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
-
-    printf("BlockScan algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n",
-        (ALGORITHM == BLOCK_SCAN_RAKING) ? "BLOCK_SCAN_RAKING" : (ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE) ? "BLOCK_SCAN_RAKING_MEMOIZE" : "BLOCK_SCAN_WARP_SCANS",
-        TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy);
-
-    // Run aggregate/prefix kernel
-    BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
-        d_in,
-        d_out,
-        d_elapsed);
-
-    // Check results
-    printf("\tOutput items: ");
-    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Check total aggregate
-    printf("\tAggregate: ");
-    compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Run this several times and average the performance results
-    GpuTimer    timer;
-    float       elapsed_millis          = 0.0;
-    clock_t     elapsed_clocks          = 0;
-
-    for (int i = 0; i < g_timing_iterations; ++i)
-    {
-        // Copy problem to device
-        cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice);
-
-        timer.Start();
-
-        // Run aggregate/prefix kernel
-        BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>(
-            d_in,
-            d_out,
-            d_elapsed);
-
-        timer.Stop();
-        elapsed_millis += timer.ElapsedMillis();
-
-        // Copy clocks from device
-        clock_t clocks;
-        CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost));
-        elapsed_clocks += clocks;
-
-    }
-
-    // Check for kernel errors and STDIO from the kernel, if any
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Display timing results
-    float avg_millis            = elapsed_millis / g_timing_iterations;
-    float avg_items_per_sec     = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f;
-    float avg_clocks            = float(elapsed_clocks) / g_timing_iterations;
-    float avg_clocks_per_item   = avg_clocks / TILE_SIZE;
-
-    printf("\tAverage BlockScan::Sum clocks: %.3f\n", avg_clocks);
-    printf("\tAverage BlockScan::Sum clocks per item: %.3f\n", avg_clocks_per_item);
-    printf("\tAverage kernel millis: %.4f\n", avg_millis);
-    printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (h_gpu) delete[] h_gpu;
-    if (d_in) cudaFree(d_in);
-    if (d_out) cudaFree(d_out);
-    if (d_elapsed) cudaFree(d_elapsed);
-}
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    args.GetCmdLineArgument("grid-size", g_grid_size);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--i=<timing iterations (default:%d)>]"
-            "[--grid-size=<grid size (default:%d)>]"
-            "[--v] "
-            "\n", argv[0], g_timing_iterations, g_grid_size);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Run tests
-    Test<1024, 1, BLOCK_SCAN_RAKING>();
-    Test<512, 2, BLOCK_SCAN_RAKING>();
-    Test<256, 4, BLOCK_SCAN_RAKING>();
-    Test<128, 8, BLOCK_SCAN_RAKING>();
-    Test<64, 16, BLOCK_SCAN_RAKING>();
-    Test<32, 32, BLOCK_SCAN_RAKING>();
-
-    printf("-------------\n");
-
-    Test<1024, 1, BLOCK_SCAN_RAKING_MEMOIZE>();
-    Test<512, 2, BLOCK_SCAN_RAKING_MEMOIZE>();
-    Test<256, 4, BLOCK_SCAN_RAKING_MEMOIZE>();
-    Test<128, 8, BLOCK_SCAN_RAKING_MEMOIZE>();
-    Test<64, 16, BLOCK_SCAN_RAKING_MEMOIZE>();
-    Test<32, 32, BLOCK_SCAN_RAKING_MEMOIZE>();
-
-    printf("-------------\n");
-
-    Test<1024, 1, BLOCK_SCAN_WARP_SCANS>();
-    Test<512, 2, BLOCK_SCAN_WARP_SCANS>();
-    Test<256, 4, BLOCK_SCAN_WARP_SCANS>();
-    Test<128, 8, BLOCK_SCAN_WARP_SCANS>();
-    Test<64, 16, BLOCK_SCAN_WARP_SCANS>();
-    Test<32, 32, BLOCK_SCAN_WARP_SCANS>();
-
-
-    return 0;
-}
-
diff --git a/external/cub/examples/block/reduce_by_key.cu b/external/cub/examples/block/reduce_by_key.cu
deleted file mode 100644
index d74e1624423..00000000000
--- a/external/cub/examples/block/reduce_by_key.cu
+++ /dev/null
@@ -1,57 +0,0 @@
-
-
-#include <cub/cub.cuh>
-
-
-template <
-    int         BLOCK_THREADS,          ///< Number of CTA threads
-    typename    KeyT,                   ///< Key type
-    typename    ValueT>                 ///< Value type
-__global__ void Kernel()
-{
-    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
-    typedef cub::KeyValuePair<int, ValueT> OffsetValuePairT;
-
-    // Reduce-value-by-segment scan operator
-    typedef cub::ReduceBySegmentOp<cub::Sum> ReduceBySegmentOpT;
-
-    // Parameterized BlockDiscontinuity type for setting head flags
-    typedef cub::BlockDiscontinuity<
-            KeyT,
-            BLOCK_THREADS>
-        BlockDiscontinuityKeysT;
-
-    // Parameterized BlockScan type
-    typedef cub::BlockScan<
-            OffsetValuePairT,
-            BLOCK_THREADS,
-            cub::BLOCK_SCAN_WARP_SCANS>
-        BlockScanT;
-
-    // Shared memory
-    __shared__ union TempStorage
-    {
-        typename BlockScanT::TempStorage                scan;           // Scan storage
-        typename BlockDiscontinuityKeysT::TempStorage   discontinuity;  // Discontinuity storage
-    } temp_storage;
-
-
-    // Read data (each thread gets 3 items each, every 9 items is a segment)
-    KeyT    my_keys[3]      = {threadIdx.x / 3, threadIdx.x / 3, threadIdx.x / 3};
-    ValueT  my_values[3]    = {1, 1, 1};
-
-    // Set head segment head flags
-    int     my_flags[3];
-    BlockDiscontinuityKeysT(temp_storage.discontinuity).FlagHeads(
-        my_flags,
-        my_keys,
-        cub::Inequality());
-
-    __syncthreads();
-
-
-
-
-
-
-}
diff --git a/external/cub/examples/device/Makefile b/external/cub/examples/device/Makefile
deleted file mode 100644
index 45b6209baf9..00000000000
--- a/external/cub/examples/device/Makefile
+++ /dev/null
@@ -1,197 +0,0 @@
-#/******************************************************************************
-# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
-# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
-# * 
-# * Redistribution and use in source and binary forms, with or without
-# * modification, are permitted provided that the following conditions are met:
-# *	 * Redistributions of source code must retain the above copyright
-# *	   notice, this list of conditions and the following disclaimer.
-# *	 * Redistributions in binary form must reproduce the above copyright
-# *	   notice, this list of conditions and the following disclaimer in the
-# *	   documentation and/or other materials provided with the distribution.
-# *	 * Neither the name of the NVIDIA CORPORATION nor the
-# *	   names of its contributors may be used to endorse or promote products
-# *	   derived from this software without specific prior written permission.
-# * 
-# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-# *
-#******************************************************************************/
-
-#-------------------------------------------------------------------------------
-#
-# Makefile usage
-#
-# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>]
-#
-#-------------------------------------------------------------------------------
- 
-include ../../common.mk 
- 
- 
-#-------------------------------------------------------------------------------
-# Includes
-#-------------------------------------------------------------------------------
-
-INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
-
-
-
-#-------------------------------------------------------------------------------
-# Dependency Lists
-#-------------------------------------------------------------------------------
-
-rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
-
-DEPS =				$(CUB_DEPS) \
-					$(CUB_DIR)test/Makefile \
-					$(CUB_DIR)test/test_util.h \
-					$(CUB_DIR)test/mersenne.h \
-		
-ALL = 	example_device_partition_flagged \
-		example_device_partition_if \
-	 	example_device_radix_sort \
-		example_device_reduce \
-	 	example_device_scan \
-	 	example_device_select_unique \
-		example_device_select_flagged \
-		example_device_select_if \
-		example_device_sort_find_non_trivial_runs
-		
-
-
-#-------------------------------------------------------------------------------
-# make default
-#-------------------------------------------------------------------------------
-
-default:
-
-
-#-------------------------------------------------------------------------------
-# make clean
-#-------------------------------------------------------------------------------
-
-clean :
-	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
-	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
-
-
-#-------------------------------------------------------------------------------
-# make all
-#-------------------------------------------------------------------------------
-
-all : $(ALL)
-
-#-------------------------------------------------------------------------------
-# make run
-#-------------------------------------------------------------------------------
-
-run : 
-	for i in $(ALL); do ./bin/$${i}_$(BIN_SUFFIX) --device=$(device) || exit 1; done
-
-
-#-------------------------------------------------------------------------------
-# make example_device_reduce
-#-------------------------------------------------------------------------------
-
-example_device_reduce: bin/example_device_reduce_$(BIN_SUFFIX)
-
-bin/example_device_reduce_$(BIN_SUFFIX) : example_device_reduce.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_reduce_$(BIN_SUFFIX) example_device_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make example_device_partition_flagged
-#-------------------------------------------------------------------------------
-
-example_device_partition_flagged: bin/example_device_partition_flagged_$(BIN_SUFFIX)
-
-bin/example_device_partition_flagged_$(BIN_SUFFIX) : example_device_partition_flagged.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_partition_flagged_$(BIN_SUFFIX) example_device_partition_flagged.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-#-------------------------------------------------------------------------------
-# make example_device_partition_if
-#-------------------------------------------------------------------------------
-
-example_device_partition_if: bin/example_device_partition_if_$(BIN_SUFFIX)
-
-bin/example_device_partition_if_$(BIN_SUFFIX) : example_device_partition_if.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_partition_if_$(BIN_SUFFIX) example_device_partition_if.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-#-------------------------------------------------------------------------------
-# make example_device_scan
-#-------------------------------------------------------------------------------
-
-example_device_scan: bin/example_device_scan_$(BIN_SUFFIX)
-
-bin/example_device_scan_$(BIN_SUFFIX) : example_device_scan.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_scan_$(BIN_SUFFIX) example_device_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make example_device_radix_sort
-#-------------------------------------------------------------------------------
-
-example_device_radix_sort: bin/example_device_radix_sort_$(BIN_SUFFIX)
-
-bin/example_device_radix_sort_$(BIN_SUFFIX) : example_device_radix_sort.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_radix_sort_$(BIN_SUFFIX) example_device_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make example_device_select_unique
-#-------------------------------------------------------------------------------
-
-example_device_select_unique: bin/example_device_select_unique_$(BIN_SUFFIX)
-
-bin/example_device_select_unique_$(BIN_SUFFIX) : example_device_select_unique.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_unique_$(BIN_SUFFIX) example_device_select_unique.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make example_device_select_flagged
-#-------------------------------------------------------------------------------
-
-example_device_select_flagged: bin/example_device_select_flagged_$(BIN_SUFFIX)
-
-bin/example_device_select_flagged_$(BIN_SUFFIX) : example_device_select_flagged.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_flagged_$(BIN_SUFFIX) example_device_select_flagged.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-#-------------------------------------------------------------------------------
-# make example_device_select_if
-#-------------------------------------------------------------------------------
-
-example_device_select_if: bin/example_device_select_if_$(BIN_SUFFIX)
-
-bin/example_device_select_if_$(BIN_SUFFIX) : example_device_select_if.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_select_if_$(BIN_SUFFIX) example_device_select_if.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make example_device_sort_find_non_trivial_runs
-#-------------------------------------------------------------------------------
-
-example_device_sort_find_non_trivial_runs: bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX)
-
-bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX) : example_device_sort_find_non_trivial_runs.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_device_sort_find_non_trivial_runs_$(BIN_SUFFIX) example_device_sort_find_non_trivial_runs.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-
diff --git a/external/cub/examples/device/example_device_partition_flagged.cu b/external/cub/examples/device/example_device_partition_flagged.cu
deleted file mode 100644
index 0c9a6477a4d..00000000000
--- a/external/cub/examples/device/example_device_partition_flagged.cu
+++ /dev/null
@@ -1,233 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple example of DevicePartition::Flagged().
- *
- * Partition flagged items from from a sequence of int keys using a
- * corresponding sequence of unsigned char flags.
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_device_partition_flagged.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_partition.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;  // Whether to display input/output to console
-CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-
-/**
- * Initialize problem, setting flags at distances of random length
- * chosen from [1..max_segment]
- */
-void Initialize(
-    int             *h_in,
-    unsigned char   *h_flags,
-    int             num_items,
-    int             max_segment)
-{
-    unsigned short max_short = (unsigned short) -1;
-
-    int key = 0;
-    int i = 0;
-    while (i < num_items)
-    {
-        // Select number of repeating occurrences
-        unsigned short repeat;
-        RandomBits(repeat);
-        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
-        repeat = CUB_MAX(1, repeat);
-
-        int j = i;
-        while (j < CUB_MIN(i + repeat, num_items))
-        {
-            h_flags[j] = 0;
-            h_in[j] = key;
-            j++;
-        }
-
-        h_flags[i] = 1;
-        i = j;
-        key++;
-    }
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("Flags:\n");
-        DisplayResults(h_flags, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/**
- * Solve unique problem
- */
-int Solve(
-    int             *h_in,
-    unsigned char   *h_flags,
-    int             *h_reference,
-    int             num_items)
-{
-    int num_selected = 0;
-    for (int i = 0; i < num_items; ++i)
-    {
-        if (h_flags[i])
-        {
-            h_reference[num_selected] = h_in[i];
-            num_selected++;
-        }
-        else
-        {
-            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
-        }
-    }
-
-    return num_selected;
-}
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items           = 150;
-    int max_segment         = 40;       // Maximum segment length
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("maxseg", max_segment);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--device=<device-id>] "
-            "[--maxseg=<max segment length>] "
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Allocate host arrays
-    int             *h_in        = new int[num_items];
-    int             *h_reference = new int[num_items];
-    unsigned char   *h_flags     = new unsigned char[num_items];
-
-    // Initialize problem and solution
-    Initialize(h_in, h_flags, num_items, max_segment);
-    int num_selected = Solve(h_in, h_flags, h_reference, num_items);
-
-    printf("cub::DevicePartition::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n",
-        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
-    fflush(stdout);
-
-    // Allocate problem device arrays
-    int             *d_in = NULL;
-    unsigned char   *d_flags = NULL;
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice));
-
-    // Allocate device output array and num selected
-    int     *d_out            = NULL;
-    int     *d_num_selected_out   = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Run
-    CubDebugExit(DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
-    printf("\t Data %s ", compare ? "FAIL" : "PASS");
-    compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
-    printf("\t Count %s ", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
-
-    printf("\n\n");
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/examples/device/example_device_partition_if.cu b/external/cub/examples/device/example_device_partition_if.cu
deleted file mode 100644
index 52ae2d6e49b..00000000000
--- a/external/cub/examples/device/example_device_partition_if.cu
+++ /dev/null
@@ -1,244 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple example of DevicePartition::If().
- *
- * Partitions items from from a sequence of int keys using a
- * section functor (greater-than)
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_device_select_if.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_partition.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;  // Whether to display input/output to console
-CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
-
-
-/// Selection functor type
-struct GreaterThan
-{
-    int compare;
-
-    __host__ __device__ __forceinline__
-    GreaterThan(int compare) : compare(compare) {}
-
-    __host__ __device__ __forceinline__
-    bool operator()(const int &a) const {
-        return (a > compare);
-    }
-};
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-/**
- * Initialize problem, setting runs of random length chosen from [1..max_segment]
- */
-void Initialize(
-    int     *h_in,
-    int     num_items,
-    int     max_segment)
-{
-    int key = 0;
-    int i = 0;
-    while (i < num_items)
-    {
-        // Randomly select number of repeating occurrences uniformly from [1..max_segment]
-        unsigned short max_short = (unsigned short) -1;
-        unsigned short repeat;
-        RandomBits(repeat);
-        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
-        repeat = CUB_MAX(1, repeat);
-
-        int j = i;
-        while (j < CUB_MIN(i + repeat, num_items))
-        {
-            h_in[j] = key;
-            j++;
-        }
-
-        i = j;
-        key++;
-    }
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/**
- * Solve unique problem
- */
-template <typename SelectOp>
-int Solve(
-    int             *h_in,
-    SelectOp        select_op,
-    int             *h_reference,
-    int             num_items)
-{
-    int num_selected = 0;
-    for (int i = 0; i < num_items; ++i)
-    {
-        if (select_op(h_in[i]))
-        {
-            h_reference[num_selected] = h_in[i];
-            num_selected++;
-        }
-        else
-        {
-            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
-        }
-    }
-
-    return num_selected;
-}
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items           = 150;
-    int max_segment         = 40;       // Maximum segment length
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("maxseg", max_segment);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--device=<device-id>] "
-            "[--maxseg=<max segment length>]"
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Allocate host arrays
-    int *h_in        = new int[num_items];
-    int *h_reference = new int[num_items];
-
-    // DevicePartition a pivot index
-    unsigned int pivot_index;
-    unsigned int max_int = (unsigned int) -1;
-    RandomBits(pivot_index);
-    pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int))));
-    printf("Pivot idx: %d\n", pivot_index); fflush(stdout);
-
-    // Initialize problem and solution
-    Initialize(h_in, num_items, max_segment);
-    GreaterThan select_op(h_in[pivot_index]);
-
-    int num_selected = Solve(h_in, select_op, h_reference, num_items);
-
-    printf("cub::DevicePartition::If %d items, %d selected (avg run length %d), %d-byte elements\n",
-        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
-    fflush(stdout);
-
-    // Allocate problem device arrays
-    int *d_in = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
-
-    // Allocate device output array and num selected
-    int     *d_out            = NULL;
-    int     *d_num_selected_out   = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Run
-    CubDebugExit(DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
-    printf("\t Data %s ", compare ? "FAIL" : "PASS");
-    compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
-    printf("\t Count %s ", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    printf("\n\n");
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/examples/device/example_device_radix_sort.cu b/external/cub/examples/device/example_device_radix_sort.cu
deleted file mode 100644
index af5de82957c..00000000000
--- a/external/cub/examples/device/example_device_radix_sort.cu
+++ /dev/null
@@ -1,226 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple example of DeviceRadixSort::SortPairs().
- *
- * Sorts an array of float keys paired with a corresponding array of int values.
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_device_radix_sort.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <algorithm>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_radix_sort.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;  // Whether to display input/output to console
-CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-/**
- * Simple key-value pairing for floating point types.  Distinguishes
- * between positive and negative zero.
- */
-struct Pair
-{
-    float   key;
-    int     value;
-
-    bool operator<(const Pair &b) const
-    {
-        if (key < b.key)
-            return true;
-
-        if (key > b.key)
-            return false;
-
-        // Return true if key is negative zero and b.key is positive zero
-        unsigned int key_bits   = *reinterpret_cast<unsigned*>(const_cast<float*>(&key));
-        unsigned int b_key_bits = *reinterpret_cast<unsigned*>(const_cast<float*>(&b.key));
-        unsigned int HIGH_BIT   = 1u << 31;
-
-        return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
-    }
-};
-
-
-/**
- * Initialize key-value sorting problem.
- */
-void Initialize(
-    float           *h_keys,
-    int             *h_values,
-    float           *h_reference_keys,
-    int             *h_reference_values,
-    int             num_items)
-{
-    Pair *h_pairs = new Pair[num_items];
-
-    for (int i = 0; i < num_items; ++i)
-    {
-        RandomBits(h_keys[i]);
-        RandomBits(h_values[i]);
-        h_pairs[i].key    = h_keys[i];
-        h_pairs[i].value  = h_values[i];
-    }
-
-    if (g_verbose)
-    {
-        printf("Input keys:\n");
-        DisplayResults(h_keys, num_items);
-        printf("\n\n");
-
-        printf("Input values:\n");
-        DisplayResults(h_values, num_items);
-        printf("\n\n");
-    }
-
-    std::stable_sort(h_pairs, h_pairs + num_items);
-
-    for (int i = 0; i < num_items; ++i)
-    {
-        h_reference_keys[i]     = h_pairs[i].key;
-        h_reference_values[i]   = h_pairs[i].value;
-    }
-
-    delete[] h_pairs;
-}
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items = 150;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--device=<device-id>] "
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    printf("cub::DeviceRadixSort::SortPairs() %d items (%d-byte keys %d-byte values)\n",
-        num_items, int(sizeof(float)), int(sizeof(int)));
-    fflush(stdout);
-
-    // Allocate host arrays
-    float   *h_keys             = new float[num_items];
-    float   *h_reference_keys   = new float[num_items];
-    int     *h_values           = new int[num_items];
-    int     *h_reference_values = new int[num_items];
-
-    // Initialize problem and solution on host
-    Initialize(h_keys, h_values, h_reference_keys, h_reference_values, num_items);
-
-    // Allocate device arrays
-    DoubleBuffer<float> d_keys;
-    DoubleBuffer<int>   d_values;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(float) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(float) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(int) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(int) * num_items));
-
-    // Allocate temporary storage
-    size_t  temp_storage_bytes  = 0;
-    void    *d_temp_storage     = NULL;
-
-    CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Initialize device arrays
-    CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));
-
-    // Run
-    CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose);
-    printf("\t Compare keys (selector %d): %s\n", d_keys.selector, compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-    compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
-    printf("\t Compare values (selector %d): %s\n", d_values.selector, compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (h_keys) delete[] h_keys;
-    if (h_reference_keys) delete[] h_reference_keys;
-    if (h_values) delete[] h_values;
-    if (h_reference_values) delete[] h_reference_values;
-
-    if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
-    if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
-    if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
-    if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    printf("\n\n");
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/examples/device/example_device_reduce.cu b/external/cub/examples/device/example_device_reduce.cu
deleted file mode 100644
index 8d160509ff8..00000000000
--- a/external/cub/examples/device/example_device_reduce.cu
+++ /dev/null
@@ -1,180 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple example of DeviceReduce::Sum().
- *
- * Sums an array of int keys.
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_device_reduce.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_reduce.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;  // Whether to display input/output to console
-CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-/**
- * Initialize problem
- */
-void Initialize(
-    int   *h_in,
-    int     num_items)
-{
-    for (int i = 0; i < num_items; ++i)
-        h_in[i] = i;
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/**
- * Compute solution
- */
-void Solve(
-    int           *h_in,
-    int           &h_reference,
-    int             num_items)
-{
-    for (int i = 0; i < num_items; ++i)
-    {
-        if (i == 0)
-            h_reference = h_in[0];
-        else
-            h_reference += h_in[i];
-    }
-}
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items = 150;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--device=<device-id>] "
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    printf("cub::DeviceReduce::Sum() %d items (%d-byte elements)\n",
-        num_items, (int) sizeof(int));
-    fflush(stdout);
-
-    // Allocate host arrays
-    int* h_in = new int[num_items];
-    int  h_reference;
-
-    // Initialize problem and solution
-    Initialize(h_in, num_items);
-    Solve(h_in, h_reference, num_items);
-
-    // Allocate problem device arrays
-    int *d_in = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
-
-    // Allocate device output array
-    int *d_out = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * 1));
-
-    // Request and allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Run
-    CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults(&h_reference, d_out, 1, g_verbose, g_verbose);
-    printf("\t%s", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    printf("\n\n");
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/examples/device/example_device_scan.cu b/external/cub/examples/device/example_device_scan.cu
deleted file mode 100644
index 53f591cf654..00000000000
--- a/external/cub/examples/device/example_device_scan.cu
+++ /dev/null
@@ -1,186 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple example of DeviceScan::ExclusiveSum().
- *
- * Computes an exclusive sum of int keys.
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_device_scan.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_scan.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;  // Whether to display input/output to console
-CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-
-/**
- * Initialize problem
- */
-void Initialize(
-    int        *h_in,
-    int          num_items)
-{
-    for (int i = 0; i < num_items; ++i)
-        h_in[i] = i;
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("\n\n");
-    }
-}
-
-/**
- * Solve exclusive-scan problem
- */
-int Solve(
-    int           *h_in,
-    int           *h_reference,
-    int             num_items)
-{
-    int inclusive = 0;
-    int aggregate = 0;
-
-    for (int i = 0; i < num_items; ++i)
-    {
-        h_reference[i] = inclusive;
-        inclusive += h_in[i];
-        aggregate += h_in[i];
-    }
-
-    return aggregate;
-}
-
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items = 150;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--device=<device-id>] "
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    printf("cub::DeviceScan::ExclusiveSum %d items (%d-byte elements)\n",
-        num_items, (int) sizeof(int));
-    fflush(stdout);
-
-    // Allocate host arrays
-    int*  h_in = new int[num_items];
-    int*  h_reference = new int[num_items];
-
-    // Initialize problem and solution
-    Initialize(h_in, num_items);
-    Solve(h_in, h_reference, num_items);
-
-    // Allocate problem device arrays
-    int *d_in = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
-
-    // Allocate device output array
-    int *d_out = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Run
-    CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
-    printf("\t%s", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    printf("\n\n");
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/examples/device/example_device_select_flagged.cu b/external/cub/examples/device/example_device_select_flagged.cu
deleted file mode 100644
index 00cf7a24f44..00000000000
--- a/external/cub/examples/device/example_device_select_flagged.cu
+++ /dev/null
@@ -1,233 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple example of DeviceSelect::Flagged().
- *
- * Selects flagged items from from a sequence of int keys using a
- * corresponding sequence of unsigned char flags.
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_device_select_flagged.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_select.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;  // Whether to display input/output to console
-CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-
-/**
- * Initialize problem, setting flags at distances of random length
- * chosen from [1..max_segment]
- */
-void Initialize(
-    int             *h_in,
-    unsigned char   *h_flags,
-    int             num_items,
-    int             max_segment)
-{
-    unsigned short max_short = (unsigned short) -1;
-
-    int key = 0;
-    int i = 0;
-    while (i < num_items)
-    {
-        // Select number of repeating occurrences
-        unsigned short repeat;
-        RandomBits(repeat);
-        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
-        repeat = CUB_MAX(1, repeat);
-
-        int j = i;
-        while (j < CUB_MIN(i + repeat, num_items))
-        {
-            h_flags[j] = 0;
-            h_in[j] = key;
-            j++;
-        }
-
-        h_flags[i] = 1;
-        i = j;
-        key++;
-    }
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("Flags:\n");
-        DisplayResults(h_flags, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/**
- * Solve unique problem
- */
-int Solve(
-    int             *h_in,
-    unsigned char   *h_flags,
-    int             *h_reference,
-    int             num_items)
-{
-    int num_selected = 0;
-    for (int i = 0; i < num_items; ++i)
-    {
-        if (h_flags[i])
-        {
-            h_reference[num_selected] = h_in[i];
-            num_selected++;
-        }
-        else
-        {
-            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
-        }
-    }
-
-    return num_selected;
-}
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items           = 150;
-    int max_segment         = 40;       // Maximum segment length
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("maxseg", max_segment);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--device=<device-id>] "
-            "[--maxseg=<max segment length>] "
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Allocate host arrays
-    int             *h_in        = new int[num_items];
-    int             *h_reference = new int[num_items];
-    unsigned char   *h_flags     = new unsigned char[num_items];
-
-    // Initialize problem and solution
-    Initialize(h_in, h_flags, num_items, max_segment);
-    int num_selected = Solve(h_in, h_flags, h_reference, num_items);
-
-    printf("cub::DeviceSelect::Flagged %d items, %d selected (avg distance %d), %d-byte elements\n",
-        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
-    fflush(stdout);
-
-    // Allocate problem device arrays
-    int             *d_in = NULL;
-    unsigned char   *d_flags = NULL;
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(unsigned char) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(unsigned char) * num_items, cudaMemcpyHostToDevice));
-
-    // Allocate device output array and num selected
-    int     *d_out            = NULL;
-    int     *d_num_selected_out   = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Run
-    CubDebugExit(DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items));
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
-    printf("\t Data %s ", compare ? "FAIL" : "PASS");
-    compare |= CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
-    printf("\t Count %s ", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
-
-    printf("\n\n");
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/examples/device/example_device_select_if.cu b/external/cub/examples/device/example_device_select_if.cu
deleted file mode 100644
index 5055f449d55..00000000000
--- a/external/cub/examples/device/example_device_select_if.cu
+++ /dev/null
@@ -1,242 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple example of DeviceSelect::If().
- *
- * Selects items from from a sequence of int keys using a
- * section functor (greater-than)
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_device_select_if.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_select.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;  // Whether to display input/output to console
-CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
-
-
-/// Selection functor type
-struct GreaterThan
-{
-    int compare;
-
-    __host__ __device__ __forceinline__
-    GreaterThan(int compare) : compare(compare) {}
-
-    __host__ __device__ __forceinline__
-    bool operator()(const int &a) const {
-        return (a > compare);
-    }
-};
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-/**
- * Initialize problem, setting runs of random length chosen from [1..max_segment]
- */
-void Initialize(
-    int     *h_in,
-    int     num_items,
-    int     max_segment)
-{
-    int key = 0;
-    int i = 0;
-    while (i < num_items)
-    {
-        // Randomly select number of repeating occurrences uniformly from [1..max_segment]
-        unsigned short max_short = (unsigned short) -1;
-        unsigned short repeat;
-        RandomBits(repeat);
-        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
-        repeat = CUB_MAX(1, repeat);
-
-        int j = i;
-        while (j < CUB_MIN(i + repeat, num_items))
-        {
-            h_in[j] = key;
-            j++;
-        }
-
-        i = j;
-        key++;
-    }
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/**
- * Solve unique problem
- */
-template <typename SelectOp>
-int Solve(
-    int             *h_in,
-    SelectOp        select_op,
-    int             *h_reference,
-    int             num_items)
-{
-    int num_selected = 0;
-    for (int i = 0; i < num_items; ++i)
-    {
-        if (select_op(h_in[i]))
-        {
-            h_reference[num_selected] = h_in[i];
-            num_selected++;
-        }
-        else
-        {
-            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
-        }
-    }
-
-    return num_selected;
-}
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items           = 150;
-    int max_segment         = 40;       // Maximum segment length
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("maxseg", max_segment);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--device=<device-id>] "
-            "[--maxseg=<max segment length>]"
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Allocate host arrays
-    int *h_in        = new int[num_items];
-    int *h_reference = new int[num_items];
-
-    // Select a pivot index
-    unsigned int pivot_index;
-    unsigned int max_int = (unsigned int) -1;
-    RandomBits(pivot_index);
-    pivot_index = (unsigned int) ((float(pivot_index) * (float(num_items - 1) / float(max_int))));
-    printf("Pivot idx: %d\n", pivot_index); fflush(stdout);
-
-    // Initialize problem and solution
-    Initialize(h_in, num_items, max_segment);
-    GreaterThan select_op(h_in[pivot_index]);
-
-    int num_selected = Solve(h_in, select_op, h_reference, num_items);
-
-    printf("cub::DeviceSelect::If %d items, %d selected (avg run length %d), %d-byte elements\n",
-        num_items, num_selected, (num_selected > 0) ? num_items / num_selected : 0, (int) sizeof(int));
-    fflush(stdout);
-
-    // Allocate problem device arrays
-    int *d_in = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
-
-    // Allocate device output array and num selected
-    int     *d_out            = NULL;
-    int     *d_num_selected_out   = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Run
-    CubDebugExit(DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op));
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
-    printf("\t Data %s ", compare ? "FAIL" : "PASS");
-    compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
-    printf("\t Count %s ", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    printf("\n\n");
-
-    return 0;
-}
-
diff --git a/external/cub/examples/device/example_device_select_unique.cu b/external/cub/examples/device/example_device_select_unique.cu
deleted file mode 100644
index b294a18c2c1..00000000000
--- a/external/cub/examples/device/example_device_select_unique.cu
+++ /dev/null
@@ -1,221 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple example of DeviceSelect::Unique().
- *
- * Selects the first element from each run of identical values from a sequence
- * of int keys.
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_device_select_unique.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_select.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;  // Whether to display input/output to console
-CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-
-/**
- * Initialize problem, setting runs of random length chosen from [1..max_segment]
- */
-void Initialize(
-    int     *h_in,
-    int     num_items,
-    int     max_segment)
-{
-    int key = 0;
-    int i = 0;
-    while (i < num_items)
-    {
-        // Randomly select number of repeating occurrences uniformly from [1..max_segment]
-        unsigned short max_short = (unsigned short) -1;
-        unsigned short repeat;
-        RandomBits(repeat);
-        repeat = (unsigned short) ((float(repeat) * (float(max_segment) / float(max_short))));
-        repeat = CUB_MAX(1, repeat);
-
-        int j = i;
-        while (j < CUB_MIN(i + repeat, num_items))
-        {
-            h_in[j] = key;
-            j++;
-        }
-
-        i = j;
-        key++;
-    }
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/**
- * Solve unique problem
- */
-int Solve(
-    int         *h_in,
-    int         *h_reference,
-    int         num_items)
-{
-    int num_selected = 0;
-    if (num_items > 0)
-    {
-        h_reference[num_selected] = h_in[0];
-        num_selected++;
-    }
-
-    for (int i = 1; i < num_items; ++i)
-    {
-        if (h_in[i] != h_in[i - 1])
-        {
-            h_reference[num_selected] = h_in[i];
-            num_selected++;
-        }
-    }
-
-    return num_selected;
-}
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items           = 150;
-    int max_segment         = 40;       // Maximum segment length
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("maxseg", max_segment);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--device=<device-id>] "
-            "[--maxseg=<max segment length>]"
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Allocate host arrays
-    int*  h_in        = new int[num_items];
-    int*  h_reference = new int[num_items];
-
-    // Initialize problem and solution
-    Initialize(h_in, num_items, max_segment);
-    int num_selected = Solve(h_in, h_reference, num_items);
-
-    printf("cub::DeviceSelect::Unique %d items (%d-byte elements), %d selected (avg run length %d)\n",
-        num_items, (int) sizeof(int), num_selected, num_items / num_selected);
-    fflush(stdout);
-
-    // Allocate problem device arrays
-    int *d_in = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
-
-    // Allocate device output array and num selected
-    int     *d_out            = NULL;
-    int     *d_num_selected_out   = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Run
-    CubDebugExit(DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items));
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
-    printf("\t Data %s ", compare ? "FAIL" : "PASS");
-    compare = compare | CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
-    printf("\t Count %s ", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    printf("\n\n");
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/examples/device/example_device_sort_find_non_trivial_runs.cu b/external/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
deleted file mode 100644
index 86d4ac55bd1..00000000000
--- a/external/cub/examples/device/example_device_sort_find_non_trivial_runs.cu
+++ /dev/null
@@ -1,384 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple example of sorting a sequence of keys and values (each pair is a
- * randomly-selected int32 paired with its original offset in the unsorted sequence), and then
- * isolating all maximal, non-trivial (having length > 1) "runs" of duplicates.
- *
- * To compile using the command line:
- *   nvcc -arch=sm_XX example_device_sort_find_non_trivial_runs.cu -I../.. -lcudart -O3
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <algorithm>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_radix_sort.cuh>
-#include <cub/device/device_run_length_encode.cuh>
-
-#include "../../test/test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;  // Whether to display input/output to console
-CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-/**
- * Simple key-value pairing for using std::sort on key-value pairs.
- */
-template <typename Key, typename Value>
-struct Pair
-{
-    Key     key;
-    Value   value;
-
-    bool operator<(const Pair &b) const
-    {
-        return (key < b.key);
-    }
-};
-
-
-/**
- * Pair ostream operator
- */
-template <typename Key, typename Value>
-std::ostream& operator<<(std::ostream& os, const Pair<Key, Value>& val)
-{
-    os << '<' << val.key << ',' << val.value << '>';
-    return os;
-}
-
-
-/**
- * Initialize problem
- */
-template <typename Key, typename Value>
-void Initialize(
-    Key    *h_keys,
-    Value  *h_values,
-    int    num_items,
-    int    max_key)
-{
-    float scale = float(max_key) / float(UINT_MAX);
-    for (int i = 0; i < num_items; ++i)
-    {
-        Key sample;
-        RandomBits(sample);
-        h_keys[i] = (max_key == -1) ? i : (Key) (scale * sample);
-        h_values[i] = i;
-    }
-
-    if (g_verbose)
-    {
-        printf("Keys:\n");
-        DisplayResults(h_keys, num_items);
-        printf("\n\n");
-
-        printf("Values:\n");
-        DisplayResults(h_values, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/**
- * Solve sorted non-trivial subrange problem.  Returns the number
- * of non-trivial runs found.
- */
-template <typename Key, typename Value>
-int Solve(
-    Key     *h_keys,
-    Value   *h_values,
-    int     num_items,
-    int     *h_offsets_reference,
-    int     *h_lengths_reference)
-{
-    // Sort
-
-    Pair<Key, Value> *h_pairs = new Pair<Key, Value>[num_items];
-    for (int i = 0; i < num_items; ++i)
-    {
-        h_pairs[i].key    = h_keys[i];
-        h_pairs[i].value  = h_values[i];
-    }
-
-    std::stable_sort(h_pairs, h_pairs + num_items);
-
-    if (g_verbose)
-    {
-        printf("Sorted pairs:\n");
-        DisplayResults(h_pairs, num_items);
-        printf("\n\n");
-    }
-
-    // Find non-trivial runs
-
-    Key     previous        = h_pairs[0].key;
-    int     length          = 1;
-    int     num_runs        = 0;
-    int     run_begin       = 0;
-
-    for (int i = 1; i < num_items; ++i)
-    {
-        if (previous != h_pairs[i].key)
-        {
-            if (length > 1)
-            {
-                h_offsets_reference[num_runs]     = run_begin;
-                h_lengths_reference[num_runs]     = length;
-                num_runs++;
-            }
-            length = 1;
-            run_begin = i;
-        }
-        else
-        {
-            length++;
-        }
-        previous = h_pairs[i].key;
-    }
-
-    if (length > 1)
-    {
-        h_offsets_reference[num_runs]   = run_begin;
-        h_lengths_reference[num_runs]   = length;
-        num_runs++;
-    }
-
-    delete[] h_pairs;
-
-    return num_runs;
-}
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    typedef unsigned int    Key;
-    typedef int             Value;
-
-    int timing_iterations   = 0;
-    int num_items           = 40;
-    Key max_key             = 20;       // Max item
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("maxkey", max_key);
-    args.GetCmdLineArgument("i", timing_iterations);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--i=<timing iterations> "
-            "[--n=<input items, default 40> "
-            "[--maxkey=<max key, default 20 (use -1 to test only unique keys)>]"
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Allocate host arrays (problem and reference solution)
-
-    Key     *h_keys                 = new Key[num_items];
-    Value   *h_values               = new Value[num_items];
-    int     *h_offsets_reference    = new int[num_items];
-    int     *h_lengths_reference    = new int[num_items];
-
-    // Initialize key-value pairs and compute reference solution (sort them, and identify non-trivial runs)
-    printf("Computing reference solution on CPU for %d items (max key %d)\n", num_items, max_key);
-    fflush(stdout);
-
-    Initialize(h_keys, h_values, num_items, max_key);
-    int num_runs = Solve(h_keys, h_values, num_items, h_offsets_reference, h_lengths_reference);
-
-    printf("%d non-trivial runs\n", num_runs);
-    fflush(stdout);
-
-    // Repeat for performance timing
-    GpuTimer gpu_timer;
-    GpuTimer gpu_rle_timer;
-    float elapsed_millis = 0.0;
-    float elapsed_rle_millis = 0.0;
-    for (int i = 0; i <= timing_iterations; ++i)
-    {
-
-        // Allocate and initialize device arrays for sorting
-        DoubleBuffer<Key>       d_keys;
-        DoubleBuffer<Value>     d_values;
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(Key) * num_items));
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(Key) * num_items));
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(Value) * num_items));
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(Value) * num_items));
-
-        CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(float) * num_items, cudaMemcpyHostToDevice));
-        CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(int) * num_items, cudaMemcpyHostToDevice));
-
-        // Start timer
-        gpu_timer.Start();
-
-        // Allocate temporary storage for sorting
-        size_t  temp_storage_bytes  = 0;
-        void    *d_temp_storage     = NULL;
-        CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
-        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-        // Do the sort
-        CubDebugExit(DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items));
-
-        // Free unused buffers and sorting temporary storage
-        if (d_keys.d_buffers[d_keys.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector ^ 1]));
-        if (d_values.d_buffers[d_values.selector ^ 1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector ^ 1]));
-        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-        // Start timer
-        gpu_rle_timer.Start();
-
-        // Allocate device arrays for enumerating non-trivial runs
-        int     *d_offests_out   = NULL;
-        int     *d_lengths_out   = NULL;
-        int     *d_num_runs      = NULL;
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offests_out, sizeof(int) * num_items));
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(int) * num_items));
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int) * 1));
-
-        // Allocate temporary storage for isolating non-trivial runs
-        d_temp_storage = NULL;
-        CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys.d_buffers[d_keys.selector],
-            d_offests_out,
-            d_lengths_out,
-            d_num_runs,
-            num_items));
-        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-        // Do the isolation
-        CubDebugExit(DeviceRunLengthEncode::NonTrivialRuns(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys.d_buffers[d_keys.selector],
-            d_offests_out,
-            d_lengths_out,
-            d_num_runs,
-            num_items));
-
-        // Free keys buffer
-        if (d_keys.d_buffers[d_keys.selector]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[d_keys.selector]));
-
-        //
-        // Hypothetically do stuff with the original key-indices corresponding to non-trivial runs of identical keys
-        //
-
-        // Stop sort timer
-        gpu_timer.Stop();
-        gpu_rle_timer.Stop();
-
-        if (i == 0)
-        {
-            // First iteration is a warmup: // Check for correctness (and display results, if specified)
-
-            printf("\nRUN OFFSETS: \n");
-            int compare = CompareDeviceResults(h_offsets_reference, d_offests_out, num_runs, true, g_verbose);
-            printf("\t\t %s ", compare ? "FAIL" : "PASS");
-
-            printf("\nRUN LENGTHS: \n");
-            compare |= CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose);
-            printf("\t\t %s ", compare ? "FAIL" : "PASS");
-
-            printf("\nNUM RUNS: \n");
-            compare |= CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
-            printf("\t\t %s ", compare ? "FAIL" : "PASS");
-
-            AssertEquals(0, compare);
-        }
-        else
-        {
-            elapsed_millis += gpu_timer.ElapsedMillis();
-            elapsed_rle_millis += gpu_rle_timer.ElapsedMillis();
-        }
-
-        // GPU cleanup
-
-        if (d_values.d_buffers[d_values.selector]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[d_values.selector]));
-        if (d_offests_out) CubDebugExit(g_allocator.DeviceFree(d_offests_out));
-        if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
-        if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
-        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-    }
-
-    // Host cleanup
-    if (h_keys) delete[] h_keys;
-    if (h_values) delete[] h_values;
-    if (h_offsets_reference) delete[] h_offsets_reference;
-    if (h_lengths_reference) delete[] h_lengths_reference;
-
-    printf("\n\n");
-
-    if (timing_iterations > 0)
-    {
-        printf("%d timing iterations, average time to sort and isolate non-trivial duplicates: %.3f ms (%.3f ms spent in RLE isolation)\n",
-            timing_iterations,
-            elapsed_millis / timing_iterations,
-            elapsed_rle_millis / timing_iterations);
-    }
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/experimental/Makefile b/external/cub/experimental/Makefile
deleted file mode 100644
index 77810746c7f..00000000000
--- a/external/cub/experimental/Makefile
+++ /dev/null
@@ -1,125 +0,0 @@
-#/******************************************************************************
-# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
-# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
-# * 
-# * Redistribution and use in source and binary forms, with or without
-# * modification, are permitted provided that the following conditions are met:
-# *	 * Redistributions of source code must retain the above copyright
-# *	   notice, this list of conditions and the following disclaimer.
-# *	 * Redistributions in binary form must reproduce the above copyright
-# *	   notice, this list of conditions and the following disclaimer in the
-# *	   documentation and/or other materials provided with the distribution.
-# *	 * Neither the name of the NVIDIA CORPORATION nor the
-# *	   names of its contributors may be used to endorse or promote products
-# *	   derived from this software without specific prior written permission.
-# * 
-# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-# *
-#******************************************************************************/
-
-#-------------------------------------------------------------------------------
-#
-# Makefile usage
-#
-# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>] [quicktest=<0|1>]
-#
-#-------------------------------------------------------------------------------
- 
-include ../common.mk 
-
-#-------------------------------------------------------------------------------
-# Commandline Options
-#-------------------------------------------------------------------------------
-
-# [mkl=<0|1>] compile against Intel MKL
-ifeq ($(mkl), 1)
-	DEFINES 	+= -DCUB_MKL
-
-ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
-	LIBS 		+=	mkl_intel_lp64.lib mkl_intel_thread.lib  mkl_core.lib libiomp5md.lib
-	NVCCFLAGS 	+= -Xcompiler /openmp
-else
-	LIBS		+= -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm
-	NVCCFLAGS 	+= -Xcompiler -fopenmp
-	
-endif	
-
-endif
-
-
-#-------------------------------------------------------------------------------
-# Compiler and compilation platform
-#-------------------------------------------------------------------------------
-
-# Includes
-INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
-
-# detect OS
-OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
-
-#-------------------------------------------------------------------------------
-# Dependency Lists
-#-------------------------------------------------------------------------------
-
-exp_rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
-
-EXP_DEPS = 	$(call rwildcard, ./,*.cuh) \
-			$(call rwildcard, ./,*.h)
-
-DEPS =				$(CUB_DEPS) \
-					$(EXP_DEPS) \
-					$(CUB_DIR)test/Makefile \
-					$(CUB_DIR)test/test_util.h \
-					$(CUB_DIR)test/mersenne.h \
-
-		
-
-#-------------------------------------------------------------------------------
-# make default
-#-------------------------------------------------------------------------------
-
-default:
-
-
-#-------------------------------------------------------------------------------
-# make clean
-#-------------------------------------------------------------------------------
-
-clean :
-	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
-	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
-
-
-
-#-------------------------------------------------------------------------------
-# make histogram_compare
-#-------------------------------------------------------------------------------
-
-histogram_compare: bin/histogram_compare_$(BIN_SUFFIX)
-
-bin/histogram_compare_$(BIN_SUFFIX) : histogram_compare.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/histogram_compare_$(BIN_SUFFIX) histogram_compare.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-	
-
-
-#-------------------------------------------------------------------------------
-# make spmv_compare
-#-------------------------------------------------------------------------------
-
-spmv_compare: bin/spmv_compare_$(BIN_SUFFIX)
-
-bin/spmv_compare_$(BIN_SUFFIX) : spmv_compare.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/spmv_compare_$(BIN_SUFFIX) spmv_compare.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -lcusparse $(MKL_LIBS) -O3
-	
-
diff --git a/external/cub/experimental/defunct/example_coo_spmv.cu b/external/cub/experimental/defunct/example_coo_spmv.cu
deleted file mode 100644
index d60697d579c..00000000000
--- a/external/cub/experimental/defunct/example_coo_spmv.cu
+++ /dev/null
@@ -1,1070 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * An implementation of COO SpMV using prefix scan to implement a
- * reduce-value-by-row strategy
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <iterator>
-#include <vector>
-#include <string>
-#include <algorithm>
-#include <stdio.h>
-
-#include <cub/cub.cuh>
-
-#include "coo_graph.cuh"
-#include "../test/test_util.h"
-
-using namespace cub;
-using namespace std;
-
-
-/******************************************************************************
- * Globals, constants, and typedefs
- ******************************************************************************/
-
-typedef int         VertexId;   // uint32s as vertex ids
-typedef double      Value;      // double-precision floating point values
-
-bool                    g_verbose       = false;
-int                     g_timing_iterations    = 1;
-CachingDeviceAllocator  g_allocator;
-
-
-/******************************************************************************
- * Texture referencing
- ******************************************************************************/
-
-/**
- * Templated texture reference type for multiplicand vector
- */
-template <typename Value>
-struct TexVector
-{
-    // Texture type to actually use (e.g., because CUDA doesn't load doubles as texture items)
-    typedef typename If<(Equals<Value, double>::VALUE), uint2, Value>::Type CastType;
-
-    // Texture reference type
-    typedef texture<CastType, cudaTextureType1D, cudaReadModeElementType> TexRef;
-
-    static TexRef ref;
-
-    /**
-     * Bind textures
-     */
-    static void BindTexture(void *d_in, int elements)
-    {
-        cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<CastType>();
-        if (d_in)
-        {
-            size_t offset;
-            size_t bytes = sizeof(CastType) * elements;
-            CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes));
-        }
-    }
-
-    /**
-     * Unbind textures
-     */
-    static void UnbindTexture()
-    {
-        CubDebugExit(cudaUnbindTexture(ref));
-    }
-
-    /**
-     * Load
-     */
-    static __device__ __forceinline__ Value Load(int offset)
-    {
-        Value output;
-        reinterpret_cast<typename TexVector<Value>::CastType &>(output) = tex1Dfetch(TexVector<Value>::ref, offset);
-        return output;
-    }
-};
-
-// Texture reference definitions
-template <typename Value>
-typename TexVector<Value>::TexRef TexVector<Value>::ref = 0;
-
-
-/******************************************************************************
- * Utility types
- ******************************************************************************/
-
-
-/**
- * A partial dot-product sum paired with a corresponding row-id
- */
-template <typename VertexId, typename Value>
-struct PartialProduct
-{
-    VertexId    row;            /// Row-id
-    Value       partial;        /// PartialProduct sum
-};
-
-
-/**
- * A partial dot-product sum paired with a corresponding row-id (specialized for double-int pairings)
- */
-template <>
-struct PartialProduct<int, double>
-{
-    long long   row;            /// Row-id
-    double      partial;        /// PartialProduct sum
-};
-
-
-/**
- * Reduce-value-by-row scan operator
- */
-struct ReduceByKeyOp
-{
-    template <typename PartialProduct>
-    __device__ __forceinline__ PartialProduct operator()(
-        const PartialProduct &first,
-        const PartialProduct &second)
-    {
-        PartialProduct retval;
-
-        retval.partial = (second.row != first.row) ?
-                second.partial :
-                first.partial + second.partial;
-
-        retval.row = second.row;
-        return retval;
-    }
-};
-
-
-/**
- * Stateful block-wide prefix operator for BlockScan
- */
-template <typename PartialProduct>
-struct BlockPrefixCallbackOp
-{
-    // Running block-wide prefix
-    PartialProduct running_prefix;
-
-    /**
-     * Returns the block-wide running_prefix in thread-0
-     */
-    __device__ __forceinline__ PartialProduct operator()(
-        const PartialProduct &block_aggregate)              ///< The aggregate sum of the BlockScan inputs
-    {
-        ReduceByKeyOp scan_op;
-
-        PartialProduct retval = running_prefix;
-        running_prefix = scan_op(running_prefix, block_aggregate);
-        return retval;
-    }
-};
-
-
-/**
- * Operator for detecting discontinuities in a list of row identifiers.
- */
-struct NewRowOp
-{
-    /// Returns true if row_b is the start of a new row
-    template <typename VertexId>
-    __device__ __forceinline__ bool operator()(
-        const VertexId& row_a,
-        const VertexId& row_b)
-    {
-        return (row_a != row_b);
-    }
-};
-
-
-
-/******************************************************************************
- * Persistent thread block types
- ******************************************************************************/
-
-/**
- * SpMV thread block abstraction for processing a contiguous segment of
- * sparse COO tiles.
- */
-template <
-    int             BLOCK_THREADS,
-    int             ITEMS_PER_THREAD,
-    typename        VertexId,
-    typename        Value>
-struct PersistentBlockSpmv
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Constants
-    enum
-    {
-        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-    // Head flag type
-    typedef int HeadFlag;
-
-    // Partial dot product type
-    typedef PartialProduct<VertexId, Value> PartialProduct;
-
-    // Parameterized BlockScan type for reduce-value-by-row scan
-    typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;
-
-    // Parameterized BlockExchange type for exchanging rows between warp-striped -> blocked arrangements
-    typedef BlockExchange<VertexId, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeRows;
-
-    // Parameterized BlockExchange type for exchanging values between warp-striped -> blocked arrangements
-    typedef BlockExchange<Value, BLOCK_THREADS, ITEMS_PER_THREAD, true> BlockExchangeValues;
-
-    // Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
-    typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;
-
-    // Shared memory type for this thread block
-    struct TempStorage
-    {
-        union
-        {
-            typename BlockExchangeRows::TempStorage         exchange_rows;      // Smem needed for BlockExchangeRows
-            typename BlockExchangeValues::TempStorage       exchange_values;    // Smem needed for BlockExchangeValues
-            struct
-            {
-                typename BlockScan::TempStorage             scan;               // Smem needed for BlockScan
-                typename BlockDiscontinuity::TempStorage    discontinuity;      // Smem needed for BlockDiscontinuity
-            };
-        };
-
-        VertexId        first_block_row;    ///< The first row-ID seen by this thread block
-        VertexId        last_block_row;     ///< The last row-ID seen by this thread block
-        Value           first_product;      ///< The first dot-product written by this thread block
-    };
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-    TempStorage                     &temp_storage;
-    BlockPrefixCallbackOp<PartialProduct>   prefix_op;
-    VertexId                        *d_rows;
-    VertexId                        *d_columns;
-    Value                           *d_values;
-    Value                           *d_vector;
-    Value                           *d_result;
-    PartialProduct                  *d_block_partials;
-    int                             block_offset;
-    int                             block_end;
-
-
-    //---------------------------------------------------------------------
-    // Operations
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__
-    PersistentBlockSpmv(
-        TempStorage                 &temp_storage,
-        VertexId                    *d_rows,
-        VertexId                    *d_columns,
-        Value                       *d_values,
-        Value                       *d_vector,
-        Value                       *d_result,
-        PartialProduct              *d_block_partials,
-        int                         block_offset,
-        int                         block_end)
-    :
-        temp_storage(temp_storage),
-        d_rows(d_rows),
-        d_columns(d_columns),
-        d_values(d_values),
-        d_vector(d_vector),
-        d_result(d_result),
-        d_block_partials(d_block_partials),
-        block_offset(block_offset),
-        block_end(block_end)
-    {
-        // Initialize scalar shared memory values
-        if (threadIdx.x == 0)
-        {
-            VertexId first_block_row            = d_rows[block_offset];
-            VertexId last_block_row             = d_rows[block_end - 1];
-
-            temp_storage.first_block_row        = first_block_row;
-            temp_storage.last_block_row         = last_block_row;
-            temp_storage.first_product          = Value(0);
-
-            // Initialize prefix_op to identity
-            prefix_op.running_prefix.row        = first_block_row;
-            prefix_op.running_prefix.partial    = Value(0);
-        }
-
-        __syncthreads();
-    }
-
-
-    /**
-     * Processes a COO input tile of edges, outputting dot products for each row
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ProcessTile(
-        int block_offset,
-        int guarded_items = 0)
-    {
-        VertexId        columns[ITEMS_PER_THREAD];
-        VertexId        rows[ITEMS_PER_THREAD];
-        Value           values[ITEMS_PER_THREAD];
-        PartialProduct  partial_sums[ITEMS_PER_THREAD];
-        HeadFlag        head_flags[ITEMS_PER_THREAD];
-
-        // Load a thread block-striped tile of A (sparse row-ids, column-ids, and values)
-        if (FULL_TILE)
-        {
-            // Unguarded loads
-            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns);
-            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values);
-            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows);
-        }
-        else
-        {
-            // This is a partial-tile (e.g., the last tile of input).  Extend the coordinates of the last
-            // vertex for out-of-bound items, but zero-valued
-            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_columns + block_offset, columns, guarded_items, VertexId(0));
-            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_values + block_offset, values, guarded_items, Value(0));
-            LoadDirectWarpStriped<LOAD_DEFAULT>(threadIdx.x, d_rows + block_offset, rows, guarded_items, temp_storage.last_block_row);
-        }
-
-        // Load the referenced values from x and compute the dot product partials sums
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-#if CUB_PTX_ARCH >= 350
-            values[ITEM] *= ThreadLoad<LOAD_LDG>(d_vector + columns[ITEM]);
-#else
-            values[ITEM] *= TexVector<Value>::Load(columns[ITEM]);
-#endif
-        }
-
-        // Transpose from warp-striped to blocked arrangement
-        BlockExchangeValues(temp_storage.exchange_values).WarpStripedToBlocked(values);
-
-        __syncthreads();
-
-        // Transpose from warp-striped to blocked arrangement
-        BlockExchangeRows(temp_storage.exchange_rows).WarpStripedToBlocked(rows);
-
-        // Barrier for smem reuse and coherence
-        __syncthreads();
-
-        // FlagT row heads by looking for discontinuities
-        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
-            head_flags,                     // (Out) Head flags
-            rows,                           // Original row ids
-            NewRowOp(),                     // Functor for detecting start of new rows
-            prefix_op.running_prefix.row);  // Last row ID from previous tile to compare with first row ID in this tile
-
-        // Assemble partial product structures
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            partial_sums[ITEM].partial = values[ITEM];
-            partial_sums[ITEM].row = rows[ITEM];
-        }
-
-        // Reduce reduce-value-by-row across partial_sums using exclusive prefix scan
-        PartialProduct block_aggregate;
-        BlockScan(temp_storage.scan).ExclusiveScan(
-            partial_sums,                   // Scan input
-            partial_sums,                   // Scan output
-            ReduceByKeyOp(),                // Scan operator
-            block_aggregate,                // Block-wide total (unused)
-            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
-
-        // Barrier for smem reuse and coherence
-        __syncthreads();
-
-        // Scatter an accumulated dot product if it is the head of a valid row
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if (head_flags[ITEM])
-            {
-                d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
-
-                // Save off the first partial product that this thread block will scatter
-                if (partial_sums[ITEM].row == temp_storage.first_block_row)
-                {
-                    temp_storage.first_product = partial_sums[ITEM].partial;
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Iterate over input tiles belonging to this thread block
-     */
-    __device__ __forceinline__
-    void ProcessTiles()
-    {
-        // Process full tiles
-        while (block_offset <= block_end - TILE_ITEMS)
-        {
-            ProcessTile<true>(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Process the last, partially-full tile (if present)
-        int guarded_items = block_end - block_offset;
-        if (guarded_items)
-        {
-            ProcessTile<false>(block_offset, guarded_items);
-        }
-
-        if (threadIdx.x == 0)
-        {
-            if (gridDim.x == 1)
-            {
-                // Scatter the final aggregate (this kernel contains only 1 thread block)
-                d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
-            }
-            else
-            {
-                // Write the first and last partial products from this thread block so
-                // that they can be subsequently "fixed up" in the next kernel.
-
-                PartialProduct first_product;
-                first_product.row       = temp_storage.first_block_row;
-                first_product.partial   = temp_storage.first_product;
-
-                d_block_partials[blockIdx.x * 2]          = first_product;
-                d_block_partials[(blockIdx.x * 2) + 1]    = prefix_op.running_prefix;
-            }
-        }
-    }
-};
-
-
-/**
- * Threadblock abstraction for "fixing up" an array of interblock SpMV partial products.
- */
-template <
-    int             BLOCK_THREADS,
-    int             ITEMS_PER_THREAD,
-    typename        VertexId,
-    typename        Value>
-struct FinalizeSpmvBlock
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Constants
-    enum
-    {
-        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-    // Head flag type
-    typedef int HeadFlag;
-
-    // Partial dot product type
-    typedef PartialProduct<VertexId, Value> PartialProduct;
-
-    // Parameterized BlockScan type for reduce-value-by-row scan
-    typedef BlockScan<PartialProduct, BLOCK_THREADS, BLOCK_SCAN_RAKING_MEMOIZE> BlockScan;
-
-    // Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
-    typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;
-
-    // Shared memory type for this thread block
-    struct TempStorage
-    {
-        typename BlockScan::TempStorage           scan;               // Smem needed for reduce-value-by-row scan
-        typename BlockDiscontinuity::TempStorage  discontinuity;      // Smem needed for head-flagging
-
-        VertexId last_block_row;
-    };
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-    TempStorage                     &temp_storage;
-    BlockPrefixCallbackOp<PartialProduct>   prefix_op;
-    Value                           *d_result;
-    PartialProduct                  *d_block_partials;
-    int                             num_partials;
-
-
-    //---------------------------------------------------------------------
-    // Operations
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__
-    FinalizeSpmvBlock(
-        TempStorage                 &temp_storage,
-        Value                       *d_result,
-        PartialProduct              *d_block_partials,
-        int                         num_partials)
-    :
-        temp_storage(temp_storage),
-        d_result(d_result),
-        d_block_partials(d_block_partials),
-        num_partials(num_partials)
-    {
-        // Initialize scalar shared memory values
-        if (threadIdx.x == 0)
-        {
-            VertexId first_block_row            = d_block_partials[0].row;
-            VertexId last_block_row             = d_block_partials[num_partials - 1].row;
-            temp_storage.last_block_row         = last_block_row;
-
-            // Initialize prefix_op to identity
-            prefix_op.running_prefix.row        = first_block_row;
-            prefix_op.running_prefix.partial    = Value(0);
-        }
-
-        __syncthreads();
-    }
-
-
-    /**
-     * Processes a COO input tile of edges, outputting dot products for each row
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__
-    void ProcessTile(
-        int block_offset,
-        int guarded_items = 0)
-    {
-        VertexId        rows[ITEMS_PER_THREAD];
-        PartialProduct  partial_sums[ITEMS_PER_THREAD];
-        HeadFlag        head_flags[ITEMS_PER_THREAD];
-
-        // Load a tile of block partials from previous kernel
-        if (FULL_TILE)
-        {
-            // Full tile
-#if CUB_PTX_ARCH >= 350
-            LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums);
-#else
-            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums);
-#endif
-        }
-        else
-        {
-            // Partial tile (extend zero-valued coordinates of the last partial-product for out-of-bounds items)
-            PartialProduct default_sum;
-            default_sum.row = temp_storage.last_block_row;
-            default_sum.partial = Value(0);
-
-#if CUB_PTX_ARCH >= 350
-            LoadDirectBlocked<LOAD_LDG>(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
-#else
-            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
-#endif
-        }
-
-        // Copy out row IDs for row-head flagging
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            rows[ITEM] = partial_sums[ITEM].row;
-        }
-
-        // FlagT row heads by looking for discontinuities
-        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
-            rows,                           // Original row ids
-            head_flags,                     // (Out) Head flags
-            NewRowOp(),                     // Functor for detecting start of new rows
-            prefix_op.running_prefix.row);   // Last row ID from previous tile to compare with first row ID in this tile
-
-        // Reduce reduce-value-by-row across partial_sums using exclusive prefix scan
-        PartialProduct block_aggregate;
-        BlockScan(temp_storage.scan).ExclusiveScan(
-            partial_sums,                   // Scan input
-            partial_sums,                   // Scan output
-            ReduceByKeyOp(),                // Scan operator
-            block_aggregate,                // Block-wide total (unused)
-            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
-
-        // Scatter an accumulated dot product if it is the head of a valid row
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if (head_flags[ITEM])
-            {
-                d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
-            }
-        }
-    }
-
-
-    /**
-     * Iterate over input tiles belonging to this thread block
-     */
-    __device__ __forceinline__
-    void ProcessTiles()
-    {
-        // Process full tiles
-        int block_offset = 0;
-        while (block_offset <= num_partials - TILE_ITEMS)
-        {
-            ProcessTile<true>(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Process final partial tile (if present)
-        int guarded_items = num_partials - block_offset;
-        if (guarded_items)
-        {
-            ProcessTile<false>(block_offset, guarded_items);
-        }
-
-        // Scatter the final aggregate (this kernel contains only 1 thread block)
-        if (threadIdx.x == 0)
-        {
-            d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
-        }
-    }
-};
-
-
-/******************************************************************************
- * Kernel entrypoints
- ******************************************************************************/
-
-
-
-/**
- * SpMV kernel whose thread blocks each process a contiguous segment of sparse COO tiles.
- */
-template <
-    int                             BLOCK_THREADS,
-    int                             ITEMS_PER_THREAD,
-    typename                        VertexId,
-    typename                        Value>
-__launch_bounds__ (BLOCK_THREADS)
-__global__ void CooKernel(
-    GridEvenShare<int>              even_share,
-    PartialProduct<VertexId, Value> *d_block_partials,
-    VertexId                        *d_rows,
-    VertexId                        *d_columns,
-    Value                           *d_values,
-    Value                           *d_vector,
-    Value                           *d_result)
-{
-    // Specialize SpMV thread block abstraction type
-    typedef PersistentBlockSpmv<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> PersistentBlockSpmv;
-
-    // Shared memory allocation
-    __shared__ typename PersistentBlockSpmv::TempStorage temp_storage;
-
-    // Initialize thread block even-share to tell us where to start and stop our tile-processing
-    even_share.BlockInit();
-
-    // Construct persistent thread block
-    PersistentBlockSpmv persistent_block(
-        temp_storage,
-        d_rows,
-        d_columns,
-        d_values,
-        d_vector,
-        d_result,
-        d_block_partials,
-        even_share.block_offset,
-        even_share.block_end);
-
-    // Process input tiles
-    persistent_block.ProcessTiles();
-}
-
-
-/**
- * Kernel for "fixing up" an array of interblock SpMV partial products.
- */
-template <
-    int                             BLOCK_THREADS,
-    int                             ITEMS_PER_THREAD,
-    typename                        VertexId,
-    typename                        Value>
-__launch_bounds__ (BLOCK_THREADS,  1)
-__global__ void CooFinalizeKernel(
-    PartialProduct<VertexId, Value> *d_block_partials,
-    int                             num_partials,
-    Value                           *d_result)
-{
-    // Specialize "fix-up" thread block abstraction type
-    typedef FinalizeSpmvBlock<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> FinalizeSpmvBlock;
-
-    // Shared memory allocation
-    __shared__ typename FinalizeSpmvBlock::TempStorage temp_storage;
-
-    // Construct persistent thread block
-    FinalizeSpmvBlock persistent_block(temp_storage, d_result, d_block_partials, num_partials);
-
-    // Process input tiles
-    persistent_block.ProcessTiles();
-}
-
-
-
-//---------------------------------------------------------------------
-// Host subroutines
-//---------------------------------------------------------------------
-
-
-/**
- * Simple test of device
- */
-template <
-    int                         COO_BLOCK_THREADS,
-    int                         COO_ITEMS_PER_THREAD,
-    int                         COO_SUBSCRIPTION_FACTOR,
-    int                         FINALIZE_BLOCK_THREADS,
-    int                         FINALIZE_ITEMS_PER_THREAD,
-    typename                    VertexId,
-    typename                    Value>
-void TestDevice(
-    CooGraph<VertexId, Value>&  coo_graph,
-    Value*                      h_vector,
-    Value*                      h_reference)
-{
-    typedef PartialProduct<VertexId, Value> PartialProduct;
-
-    const int COO_TILE_SIZE = COO_BLOCK_THREADS * COO_ITEMS_PER_THREAD;
-
-    // SOA device storage
-    VertexId        *d_rows;             // SOA graph row coordinates
-    VertexId        *d_columns;          // SOA graph col coordinates
-    Value           *d_values;           // SOA graph values
-    Value           *d_vector;           // Vector multiplicand
-    Value           *d_result;           // Output row
-    PartialProduct  *d_block_partials;   // Temporary storage for communicating dot product partials between thread blocks
-
-    // Create SOA version of coo_graph on host
-    int             num_edges   = coo_graph.coo_tuples.size();
-    VertexId        *h_rows     = new VertexId[num_edges];
-    VertexId        *h_columns  = new VertexId[num_edges];
-    Value           *h_values   = new Value[num_edges];
-    for (int i = 0; i < num_edges; i++)
-    {
-        h_rows[i]       = coo_graph.coo_tuples[i].row;
-        h_columns[i]    = coo_graph.coo_tuples[i].col;
-        h_values[i]     = coo_graph.coo_tuples[i].val;
-    }
-
-    // Get CUDA properties
-    Device device_props;
-    CubDebugExit(device_props.Init());
-
-    // Determine launch configuration from kernel properties
-    int coo_sm_occupancy;
-    CubDebugExit(device_props.MaxSmOccupancy(
-        coo_sm_occupancy,
-        CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, VertexId, Value>,
-        COO_BLOCK_THREADS));
-    int max_coo_grid_size   = device_props.sm_count * coo_sm_occupancy * COO_SUBSCRIPTION_FACTOR;
-
-    // Construct an even-share work distribution
-    GridEvenShare<int> even_share(num_edges, max_coo_grid_size, COO_TILE_SIZE);
-    int coo_grid_size  = even_share.grid_size;
-    int num_partials   = coo_grid_size * 2;
-
-    // Allocate COO device arrays
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_rows,            sizeof(VertexId) * num_edges));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_columns,         sizeof(VertexId) * num_edges));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values,          sizeof(Value) * num_edges));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_vector,          sizeof(Value) * coo_graph.col_dim));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_result,          sizeof(Value) * coo_graph.row_dim));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_block_partials,  sizeof(PartialProduct) * num_partials));
-
-    // Copy host arrays to device
-    CubDebugExit(cudaMemcpy(d_rows,     h_rows,     sizeof(VertexId) * num_edges,       cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(d_columns,  h_columns,  sizeof(VertexId) * num_edges,       cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(d_values,   h_values,   sizeof(Value) * num_edges,          cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(d_vector,   h_vector,   sizeof(Value) * coo_graph.col_dim,  cudaMemcpyHostToDevice));
-
-    // Bind textures
-    TexVector<Value>::BindTexture(d_vector, coo_graph.col_dim);
-
-    // Print debug info
-    printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n",
-        COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_sm_occupancy);
-    if (coo_grid_size > 1)
-    {
-        printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS);
-    }
-    fflush(stdout);
-
-    CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
-
-    // Run kernel (always run one iteration without timing)
-    GpuTimer gpu_timer;
-    float elapsed_millis = 0.0;
-    for (int i = 0; i <= g_timing_iterations; i++)
-    {
-        gpu_timer.Start();
-
-        // Initialize output
-        CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value)));
-
-        // Run the COO kernel
-        CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD><<<coo_grid_size, COO_BLOCK_THREADS>>>(
-            even_share,
-            d_block_partials,
-            d_rows,
-            d_columns,
-            d_values,
-            d_vector,
-            d_result);
-
-        if (coo_grid_size > 1)
-        {
-            // Run the COO finalize kernel
-            CooFinalizeKernel<FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD><<<1, FINALIZE_BLOCK_THREADS>>>(
-                d_block_partials,
-                num_partials,
-                d_result);
-        }
-
-        gpu_timer.Stop();
-
-        if (i > 0)
-            elapsed_millis += gpu_timer.ElapsedMillis();
-    }
-
-    // Force any kernel stdio to screen
-    CubDebugExit(cudaThreadSynchronize());
-    fflush(stdout);
-
-    // Display timing
-    if (g_timing_iterations > 0)
-    {
-        float avg_elapsed = elapsed_millis / g_timing_iterations;
-        int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim);
-        printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n",
-            g_timing_iterations,
-            avg_elapsed,
-            total_bytes / avg_elapsed / 1000.0 / 1000.0,
-            num_edges * 2 / avg_elapsed / 1000.0 / 1000.0);
-    }
-
-    // Check results
-    int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    TexVector<Value>::UnbindTexture();
-    CubDebugExit(g_allocator.DeviceFree(d_block_partials));
-    CubDebugExit(g_allocator.DeviceFree(d_rows));
-    CubDebugExit(g_allocator.DeviceFree(d_columns));
-    CubDebugExit(g_allocator.DeviceFree(d_values));
-    CubDebugExit(g_allocator.DeviceFree(d_vector));
-    CubDebugExit(g_allocator.DeviceFree(d_result));
-    delete[] h_rows;
-    delete[] h_columns;
-    delete[] h_values;
-}
-
-
-/**
- * Compute reference answer on CPU
- */
-template <typename VertexId, typename Value>
-void ComputeReference(
-    CooGraph<VertexId, Value>&  coo_graph,
-    Value*                      h_vector,
-    Value*                      h_reference)
-{
-    for (VertexId i = 0; i < coo_graph.row_dim; i++)
-    {
-        h_reference[i] = 0.0;
-    }
-
-    for (VertexId i = 0; i < coo_graph.coo_tuples.size(); i++)
-    {
-        h_reference[coo_graph.coo_tuples[i].row] +=
-            coo_graph.coo_tuples[i].val *
-            h_vector[coo_graph.coo_tuples[i].col];
-    }
-}
-
-
-/**
- * Assign arbitrary values to vector items
- */
-template <typename Value>
-void AssignVectorValues(Value *vector, int col_dim)
-{
-    for (int i = 0; i < col_dim; i++)
-    {
-        vector[i] = 1.0;
-    }
-}
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("i", g_timing_iterations);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s\n [--device=<device-id>] [--v] [--iterations=<test iterations>] [--grid-size=<grid-size>]\n"
-            "\t--type=wheel --spokes=<spokes>\n"
-            "\t--type=grid2d --width=<width> [--no-self-loops]\n"
-            "\t--type=grid3d --width=<width> [--no-self-loops]\n"
-            "\t--type=market --file=<file>\n"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Get graph type
-    string type;
-    args.GetCmdLineArgument("type", type);
-
-    // Generate graph structure
-
-    CpuTimer timer;
-    timer.Start();
-    CooGraph<VertexId, Value> coo_graph;
-    if (type == string("grid2d"))
-    {
-        VertexId width;
-        args.GetCmdLineArgument("width", width);
-        bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
-        printf("Generating %s grid2d width(%d)... ", (self_loops) ? "5-pt" : "4-pt", width); fflush(stdout);
-        if (coo_graph.InitGrid2d(width, self_loops)) exit(1);
-    } else if (type == string("grid3d"))
-    {
-        VertexId width;
-        args.GetCmdLineArgument("width", width);
-        bool self_loops = !args.CheckCmdLineFlag("no-self-loops");
-        printf("Generating %s grid3d width(%d)... ", (self_loops) ? "7-pt" : "6-pt", width); fflush(stdout);
-        if (coo_graph.InitGrid3d(width, self_loops)) exit(1);
-    }
-    else if (type == string("wheel"))
-    {
-        VertexId spokes;
-        args.GetCmdLineArgument("spokes", spokes);
-        printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout);
-        if (coo_graph.InitWheel(spokes)) exit(1);
-    }
-    else if (type == string("market"))
-    {
-        string filename;
-        args.GetCmdLineArgument("file", filename);
-        printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout);
-        if (coo_graph.InitMarket(filename)) exit(1);
-    }
-    else
-    {
-        printf("Unsupported graph type\n");
-        exit(1);
-    }
-    timer.Stop();
-    printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n",
-        timer.ElapsedMillis() / 1000.0,
-        coo_graph.coo_tuples.size(),
-        coo_graph.row_dim,
-        coo_graph.col_dim);
-    fflush(stdout);
-
-    if (g_verbose)
-    {
-        cout << coo_graph << "\n";
-    }
-
-    // Create vector
-    Value *h_vector = new Value[coo_graph.col_dim];
-    AssignVectorValues(h_vector, coo_graph.col_dim);
-    if (g_verbose)
-    {
-        printf("Vector[%d]: ", coo_graph.col_dim);
-        DisplayResults(h_vector, coo_graph.col_dim);
-        printf("\n\n");
-    }
-
-    // Compute reference answer
-    Value *h_reference = new Value[coo_graph.row_dim];
-    ComputeReference(coo_graph, h_vector, h_reference);
-    if (g_verbose)
-    {
-        printf("Results[%d]: ", coo_graph.row_dim);
-        DisplayResults(h_reference, coo_graph.row_dim);
-        printf("\n\n");
-    }
-
-    // Parameterization for SM35
-    enum
-    {
-        COO_BLOCK_THREADS           = 64,
-        COO_ITEMS_PER_THREAD        = 10,
-        COO_SUBSCRIPTION_FACTOR     = 4,
-        FINALIZE_BLOCK_THREADS      = 256,
-        FINALIZE_ITEMS_PER_THREAD   = 4,
-    };
-
-    // Run GPU version
-    TestDevice<
-        COO_BLOCK_THREADS,
-        COO_ITEMS_PER_THREAD,
-        COO_SUBSCRIPTION_FACTOR,
-        FINALIZE_BLOCK_THREADS,
-        FINALIZE_ITEMS_PER_THREAD>(coo_graph, h_vector, h_reference);
-
-    // Cleanup
-    delete[] h_vector;
-    delete[] h_reference;
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/experimental/defunct/test_device_seg_reduce.cu b/external/cub/experimental/defunct/test_device_seg_reduce.cu
deleted file mode 100644
index 20ef4764389..00000000000
--- a/external/cub/experimental/defunct/test_device_seg_reduce.cu
+++ /dev/null
@@ -1,2142 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * An implementation of segmented reduction using a load-balanced parallelization
- * strategy based on the MergePath decision path.
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <iterator>
-#include <vector>
-#include <string>
-#include <algorithm>
-#include <stdio.h>
-
-#include <cub/cub.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-using namespace std;
-
-
-/******************************************************************************
- * Globals, constants, and typedefs
- ******************************************************************************/
-
-bool                    g_verbose           = false;
-int                     g_timing_iterations = 1;
-CachingDeviceAllocator  g_allocator(true);
-
-
-/******************************************************************************
- * Utility routines
- ******************************************************************************/
-
-
-/**
- * An pair of index offsets
- */
-template <typename OffsetT>
-struct IndexPair
-{
-    OffsetT a_idx;
-    OffsetT b_idx;
-};
-
-
-/**
- * Computes the begin offsets into A and B for the specified
- * location (diagonal) along the merge decision path
- */
-template <
-    int                 BLOCK_THREADS,
-    typename            IteratorA,
-    typename            IteratorB,
-    typename            OffsetT>
-__device__ __forceinline__ void ParallelMergePathSearch(
-    OffsetT             diagonal,
-    IteratorA           a,
-    IteratorB           b,
-    IndexPair<OffsetT>  begin,          // Begin offsets into a and b
-    IndexPair<OffsetT>  end,            // End offsets into a and b
-    IndexPair<OffsetT>  &intersection)  // [out] Intersection offsets into a and b
-{
-    OffsetT a_split_min = CUB_MAX(diagonal - end.b_idx, begin.a_idx);
-    OffsetT a_split_max = CUB_MIN(diagonal, end.a_idx);
-
-    while (a_split_min < a_split_max)
-    {
-        OffsetT a_distance       = a_split_max - a_split_min;
-        OffsetT a_slice          = (a_distance + BLOCK_THREADS - 1) >> Log2<BLOCK_THREADS>::VALUE;
-        OffsetT a_split_pivot    = CUB_MIN(a_split_min + (threadIdx.x * a_slice), end.a_idx - 1);
-
-        int move_up = (a[a_split_pivot] <= b[diagonal - a_split_pivot - 1]);
-        int num_up = __syncthreads_count(move_up);
-/*
-        _CubLog("a_split_min(%d), a_split_max(%d) a_distance(%d), a_slice(%d), a_split_pivot(%d), move_up(%d), num_up(%d), a_begin(%d), a_end(%d)\n",
-            a_split_min, a_split_max, a_distance, a_slice, a_split_pivot, move_up, num_up, a_begin, a_end);
-*/
-        a_split_max = CUB_MIN(num_up * a_slice, end.a_idx);
-        a_split_min = CUB_MAX(a_split_max - a_slice, begin.a_idx) + 1;
-    }
-
-    intersection.a_idx = CUB_MIN(a_split_min, end.a_idx);
-    intersection.b_idx = CUB_MIN(diagonal - a_split_min, end.b_idx);
-}
-
-/**
- * Computes the begin offsets into A and B for the specified
- * location (diagonal) along the merge decision path
- */
-template <
-    typename            IteratorA,
-    typename            IteratorB,
-    typename            OffsetT>
-__device__ __forceinline__ void MergePathSearch(
-    OffsetT             diagonal,
-    IteratorA           a,
-    IteratorB           b,
-    IndexPair<OffsetT>  begin,          // Begin offsets into a and b
-    IndexPair<OffsetT>  end,            // End offsets into a and b
-    IndexPair<OffsetT>  &intersection)  // [out] Intersection offsets into a and b
-{
-    OffsetT split_min = CUB_MAX(diagonal - end.b_idx, begin.a_idx);
-    OffsetT split_max = CUB_MIN(diagonal, end.a_idx);
-
-    while (split_min < split_max)
-    {
-        OffsetT split_pivot = (split_min + split_max) >> 1;
-        if (a[split_pivot] <= b[diagonal - split_pivot - 1])
-        {
-            // Move candidate split range up A, down B
-            split_min = split_pivot + 1;
-        }
-        else
-        {
-            // Move candidate split range up B, down A
-            split_max = split_pivot;
-        }
-    }
-
-    intersection.a_idx = CUB_MIN(split_min, end.a_idx);
-    intersection.b_idx = CUB_MIN(diagonal - split_min, end.b_idx);
-}
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockSegReduceRegion
- */
-template <
-    int                     _BLOCK_THREADS,             ///< Threads per thread block
-    int                     _ITEMS_PER_THREAD,          ///< Items per thread (per tile of input)
-    bool                    _USE_SMEM_SEGMENT_CACHE,    ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
-    bool                    _USE_SMEM_VALUE_CACHE,      ///< Whether or not to cache incoming values in shared memory before reducing each tile
-    CacheLoadModifier       _LOAD_MODIFIER_SEGMENTS,    ///< Cache load modifier for reading segment offsets
-    CacheLoadModifier       _LOAD_MODIFIER_VALUES,      ///< Cache load modifier for reading values
-    BlockReduceAlgorithm    _REDUCE_ALGORITHM,          ///< The BlockReduce algorithm to use
-    BlockScanAlgorithm      _SCAN_ALGORITHM>            ///< The BlockScan algorithm to use
-struct BlockSegReduceRegionPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-        USE_SMEM_SEGMENT_CACHE  = _USE_SMEM_SEGMENT_CACHE,      ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
-        USE_SMEM_VALUE_CACHE    = _USE_SMEM_VALUE_CACHE,        ///< Whether or not to cache incoming upcoming values in shared memory before reducing each tile
-    };
-
-    static const CacheLoadModifier      LOAD_MODIFIER_SEGMENTS  = _LOAD_MODIFIER_SEGMENTS;  ///< Cache load modifier for reading segment offsets
-    static const CacheLoadModifier      LOAD_MODIFIER_VALUES    = _LOAD_MODIFIER_VALUES;    ///< Cache load modifier for reading values
-    static const BlockReduceAlgorithm   REDUCE_ALGORITHM        = _REDUCE_ALGORITHM;        ///< The BlockReduce algorithm to use
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;          ///< The BlockScan algorithm to use
-};
-
-
-/******************************************************************************
- * Persistent thread block types
- ******************************************************************************/
-
-/**
- * \brief BlockSegReduceTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide segmented reduction.
- */
-template <
-    typename BlockSegReduceRegionPolicy,    ///< Parameterized BlockSegReduceRegionPolicy tuning policy
-    typename SegmentOffsetIterator,         ///< Random-access input iterator type for reading segment end-offsets
-    typename ValueIterator,                 ///< Random-access input iterator type for reading values
-    typename OutputIteratorT,               ///< Random-access output iterator type for writing segment reductions
-    typename ReductionOp,                   ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename OffsetT>                       ///< Signed integer type for global offsets
-struct BlockSegReduceRegion
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = BlockSegReduceRegionPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockSegReduceRegionPolicy::ITEMS_PER_THREAD,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,                     /// Number of work items to be processed per tile
-
-        USE_SMEM_SEGMENT_CACHE  = BlockSegReduceRegionPolicy::USE_SMEM_SEGMENT_CACHE,      ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
-        USE_SMEM_VALUE_CACHE    = BlockSegReduceRegionPolicy::USE_SMEM_VALUE_CACHE,        ///< Whether or not to cache incoming upcoming values in shared memory before reducing each tile
-
-        SMEM_SEGMENT_CACHE_ITEMS    = USE_SMEM_SEGMENT_CACHE ? TILE_ITEMS : 1,
-        SMEM_VALUE_CACHE_ITEMS      = USE_SMEM_VALUE_CACHE ? TILE_ITEMS : 1,
-    };
-
-    // Segment offset type
-    typedef typename std::iterator_traits<SegmentOffsetIterator>::value_type SegmentOffset;
-
-    // Value type
-    typedef typename std::iterator_traits<ValueIterator>::value_type Value;
-
-    // Counting iterator type
-    typedef CountingInputIterator<SegmentOffsetT, OffsetT> CountingIterator;
-
-    // Segment offsets iterator wrapper type
-    typedef typename If<(IsPointer<SegmentOffsetIterator>::VALUE),
-            CacheModifiedInputIterator<BlockSegReduceRegionPolicy::LOAD_MODIFIER_SEGMENTS, SegmentOffsetT, OffsetT>,  // Wrap the native input pointer with CacheModifiedInputIterator
-            SegmentOffsetIterator>::Type                                                                            // Directly use the supplied input iterator type
-        WrappedSegmentOffsetIterator;
-
-    // Values iterator wrapper type
-    typedef typename If<(IsPointer<ValueIterator>::VALUE),
-            CacheModifiedInputIterator<BlockSegReduceRegionPolicy::LOAD_MODIFIER_VALUES, Value, OffsetT>,        // Wrap the native input pointer with CacheModifiedInputIterator
-            ValueIterator>::Type                                                                                // Directly use the supplied input iterator type
-        WrappedValueIterator;
-
-    // Tail flag type for marking segment discontinuities
-    typedef int TailFlag;
-
-    // Reduce-by-key data type tuple (segment-ID, value)
-    typedef KeyValuePair<OffsetT, Value> KeyValuePair;
-
-    // Index pair data type
-    typedef IndexPair<OffsetT> IndexPair;
-
-    // BlockScan scan operator for reduction-by-segment
-    typedef ReduceByKeyOp<ReductionOp> ReduceByKeyOp;
-
-    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
-    typedef RunningBlockPrefixCallbackOp<
-            KeyValuePair,
-            ReduceByKeyOp>
-        RunningPrefixCallbackOp;
-
-    // Parameterized BlockShift type for exchanging index pairs
-    typedef BlockShift<
-            IndexPair,
-            BLOCK_THREADS>
-        BlockShift;
-
-    // Parameterized BlockReduce type for block-wide reduction
-    typedef BlockReduce<
-            Value,
-            BLOCK_THREADS,
-            BlockSegReduceRegionPolicy::REDUCE_ALGORITHM>
-        BlockReduce;
-
-    // Parameterized BlockScan type for block-wide reduce-value-by-key
-    typedef BlockScan<
-            KeyValuePair,
-            BLOCK_THREADS,
-            BlockSegReduceRegionPolicy::SCAN_ALGORITHM>
-        BlockScan;
-
-    // Shared memory type for this thread block
-    struct _TempStorage
-    {
-        union
-        {
-            // Smem needed for BlockScan
-            typename BlockScan::TempStorage scan;
-
-            // Smem needed for BlockReduce
-            typename BlockReduce::TempStorage reduce;
-
-            struct
-            {
-                // Smem needed for communicating start/end indices between threads for a given work tile
-                typename BlockShift::TempStorage shift;
-
-                // Smem needed for caching segment end-offsets
-                SegmentOffset cached_segment_end_offsets[SMEM_SEGMENT_CACHE_ITEMS + 1];
-            };
-
-            // Smem needed for caching values
-            Value cached_values[SMEM_VALUE_CACHE_ITEMS];
-        };
-
-        IndexPair block_region_idx[2];      // The starting [0] and ending [1] pairs of segment and value indices for the thread block's region
-
-        // The first partial reduction tuple scattered by this thread block
-        KeyValuePair first_tuple;
-    };
-
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage                    &temp_storage;          ///< Reference to shared storage
-    WrappedSegmentOffsetIterator    d_segment_end_offsets;  ///< A sequence of \p num_segments segment end-offsets
-    WrappedValueIterator            d_values;               ///< A sequence of \p num_values data to reduce
-    OutputIteratorT                  d_output;               ///< A sequence of \p num_segments segment totals
-    CountingIterator                d_value_offsets;        ///< A sequence of \p num_values value-offsets
-    IndexPair                       *d_block_idx;
-    OffsetT                         num_values;             ///< Total number of values to reduce
-    OffsetT                         num_segments;           ///< Number of segments being reduced
-    Value                           identity;               ///< Identity value (for zero-length segments)
-    ReductionOp                     reduction_op;           ///< Reduction operator
-    ReduceByKeyOp                   scan_op;                ///< Reduce-by-key scan operator
-    RunningPrefixCallbackOp         prefix_op;              ///< Stateful running total for block-wide prefix scan of partial reduction tuples
-
-
-    //---------------------------------------------------------------------
-    // Operations
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__
-    BlockSegReduceRegion(
-        TempStorage             &temp_storage,          ///< Reference to shared storage
-        SegmentOffsetIterator   d_segment_end_offsets,  ///< A sequence of \p num_segments segment end-offsets
-        ValueIterator           d_values,               ///< A sequence of \p num_values values
-        OutputIteratorT          d_output,               ///< A sequence of \p num_segments segment totals
-        IndexPair               *d_block_idx,
-        OffsetT                 num_values,             ///< Number of values to reduce
-        OffsetT                 num_segments,           ///< Number of segments being reduced
-        Value                   identity,               ///< Identity value (for zero-length segments)
-        ReductionOp             reduction_op)           ///< Reduction operator
-    :
-        temp_storage(temp_storage.Alias()),
-        d_segment_end_offsets(d_segment_end_offsets),
-        d_values(d_values),
-        d_value_offsets(0),
-        d_output(d_output),
-        d_block_idx(d_block_idx),
-        num_values(num_values),
-        num_segments(num_segments),
-        identity(identity),
-        reduction_op(reduction_op),
-        scan_op(reduction_op),
-        prefix_op(scan_op)
-    {}
-
-
-    /**
-     * Fast-path single-segment tile reduction.  Perform a
-     * simple block-wide reduction and accumulate the result into
-     * the running total.
-     */
-    __device__ __forceinline__ void SingleSegmentTile(
-        IndexPair next_tile_idx,
-        IndexPair block_idx)
-    {
-        OffsetT tile_values = next_tile_idx.b_idx - block_idx.b_idx;
-
-        // Load a tile's worth of values (using identity for out-of-bounds items)
-        Value values[ITEMS_PER_THREAD];
-        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values + block_idx.b_idx, values, tile_values, identity);
-
-        // Barrier for smem reuse
-        __syncthreads();
-
-        // Reduce the tile of values and update the running total in thread-0
-        KeyValuePair tile_aggregate;
-        tile_aggregate.key      = block_idx.a_idx;
-        tile_aggregate.value    = BlockReduce(temp_storage.reduce).Reduce(values, reduction_op);
-
-        if (threadIdx.x == 0)
-        {
-            prefix_op.running_total = scan_op(prefix_op.running_total, tile_aggregate);
-        }
-    }
-
-    /**
-     * Fast-path empty-segment tile reduction.  Write out a tile of identity
-     * values to output.
-     */
-    __device__ __forceinline__ void EmptySegmentsTile(
-        IndexPair next_tile_idx,
-        IndexPair block_idx)
-    {
-        Value segment_reductions[ITEMS_PER_THREAD];
-
-        if (threadIdx.x == 0)
-        {
-            // The first segment gets the running segment total
-            segment_reductions[0] = prefix_op.running_total.value;
-
-            // Update the running prefix
-            prefix_op.running_total.value = identity;
-            prefix_op.running_total.key = next_tile_idx.a_idx;
-        }
-        else
-        {
-            // Remainder of segments in this tile get identity
-            segment_reductions[0] = identity;
-        }
-
-        // Remainder of segments in this tile get identity
-        #pragma unroll
-        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            segment_reductions[ITEM] = identity;
-
-        // Store reductions
-        OffsetT tile_segments = next_tile_idx.a_idx - block_idx.a_idx;
-        StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_output + block_idx.a_idx, segment_reductions, tile_segments);
-    }
-
-
-    /**
-     * Multi-segment tile reduction.
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void MultiSegmentTile(
-        IndexPair block_idx,
-        IndexPair thread_idx,
-        IndexPair next_thread_idx,
-        IndexPair next_tile_idx)
-    {
-        IndexPair local_thread_idx;
-        local_thread_idx.a_idx = thread_idx.a_idx - block_idx.a_idx;
-        local_thread_idx.b_idx = thread_idx.b_idx - block_idx.b_idx;
-
-        // Check if first segment end-offset is in range
-        bool valid_segment = FULL_TILE || (thread_idx.a_idx < next_thread_idx.a_idx);
-
-        // Check if first value offset is in range
-        bool valid_value = FULL_TILE || (thread_idx.b_idx < next_thread_idx.b_idx);
-
-        // Load first segment end-offset
-        OffsetT segment_end_offset = (valid_segment) ?
-            (USE_SMEM_SEGMENT_CACHE)?
-                temp_storage.cached_segment_end_offsets[local_thread_idx.a_idx] :
-                d_segment_end_offsets[thread_idx.a_idx] :
-            -1;
-
-        OffsetT segment_ids[ITEMS_PER_THREAD];
-        OffsetT value_offsets[ITEMS_PER_THREAD];
-
-        KeyValuePair first_partial;
-        first_partial.key    = thread_idx.a_idx;
-        first_partial.value  = identity;
-
-        // Get segment IDs and gather-offsets for values
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            segment_ids[ITEM]   = -1;
-            value_offsets[ITEM] = -1;
-
-            // Whether or not we slide (a) right along the segment path or (b) down the value path
-            if (valid_segment && (!valid_value || (segment_end_offset <= thread_idx.b_idx)))
-            {
-                // Consume this segment index
-                segment_ids[ITEM] = thread_idx.a_idx;
-                thread_idx.a_idx++;
-                local_thread_idx.a_idx++;
-
-                valid_segment = FULL_TILE || (thread_idx.a_idx < next_thread_idx.a_idx);
-
-                // Read next segment end-offset (if valid)
-                if (valid_segment)
-                {
-                    if (USE_SMEM_SEGMENT_CACHE)
-                        segment_end_offset = temp_storage.cached_segment_end_offsets[local_thread_idx.a_idx];
-                    else
-                        segment_end_offset = d_segment_end_offsets[thread_idx.a_idx];
-                }
-            }
-            else if (valid_value)
-            {
-                // Consume this value index
-                value_offsets[ITEM] = thread_idx.b_idx;
-                thread_idx.b_idx++;
-                local_thread_idx.b_idx++;
-
-                valid_value = FULL_TILE || (thread_idx.b_idx < next_thread_idx.b_idx);
-            }
-        }
-
-        // Load values
-        Value values[ITEMS_PER_THREAD];
-
-        if (USE_SMEM_VALUE_CACHE)
-        {
-            // Barrier for smem reuse
-            __syncthreads();
-
-            OffsetT tile_values = next_tile_idx.b_idx - block_idx.b_idx;
-
-            // Load a tile's worth of values (using identity for out-of-bounds items)
-            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values + block_idx.b_idx, values, tile_values, identity);
-
-            // Store to shared
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, temp_storage.cached_values, values, tile_values);
-
-            // Barrier for smem reuse
-            __syncthreads();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                values[ITEM] = (value_offsets[ITEM] == -1) ?
-                    identity :
-                    temp_storage.cached_values[value_offsets[ITEM] - block_idx.b_idx];
-            }
-        }
-        else
-        {
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                values[ITEM] = (value_offsets[ITEM] == -1) ?
-                    identity :
-                    d_values[value_offsets[ITEM]];
-            }
-        }
-
-        // Reduce within thread segments
-        KeyValuePair running_total = first_partial;
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (segment_ids[ITEM] != -1)
-            {
-                // Consume this segment index
-                d_output[segment_ids[ITEM]] = running_total.value;
-
-//                _CubLog("Updating segment %d with value %lld\n", segment_ids[ITEM], running_total.value)
-
-                if (first_partial.key == segment_ids[ITEM])
-                    first_partial.value = running_total.value;
-
-                running_total.key    = segment_ids[ITEM];
-                running_total.value  = identity;
-            }
-
-            running_total.value = reduction_op(running_total.value, values[ITEM]);
-        }
-/*
-
-        // Barrier for smem reuse
-        __syncthreads();
-
-        // Use prefix scan to reduce values by segment-id.  The segment-reductions end up in items flagged as segment-tails.
-        KeyValuePair block_aggregate;
-        BlockScan(temp_storage.scan).InclusiveScan(
-            pairs,                          // Scan input
-            pairs,                          // Scan output
-            scan_op,                        // Scan operator
-            block_aggregate,                // Block-wide total (unused)
-            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
-*/
-
-/*
-        // Check if first segment end-offset is in range
-        bool valid_segment = (thread_idx.a_idx < next_thread_idx.a_idx);
-
-        // Check if first value offset is in range
-        bool valid_value = (thread_idx.b_idx < next_thread_idx.b_idx);
-
-        // Load first segment end-offset
-        OffsetT segment_end_offset = (valid_segment) ?
-            d_segment_end_offsets[thread_idx.a_idx] :
-            num_values;                                                     // Out of range (the last segment end-offset is one-past the last value offset)
-
-        // Load first value offset
-        OffsetT value_offset = (valid_value) ?
-            d_value_offsets[thread_idx.b_idx] :
-            num_values;                                                     // Out of range (one-past the last value offset)
-
-        // Assemble segment-demarcating tail flags and partial reduction tuples
-        TailFlag        tail_flags[ITEMS_PER_THREAD];
-        KeyValuePair    partial_reductions[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            // Default tuple and flag values
-            partial_reductions[ITEM].key    = thread_idx.a_idx;
-            partial_reductions[ITEM].value  = identity;
-            tail_flags[ITEM]                = 0;
-
-            // Whether or not we slide (a) right along the segment path or (b) down the value path
-            if (valid_segment && (!valid_value || (segment_end_offset <= value_offset)))
-            {
-                // Consume this segment index
-
-                // Set tail flag noting the end of the segment
-                tail_flags[ITEM] = 1;
-
-                // Increment segment index
-                thread_idx.a_idx++;
-
-                // Read next segment end-offset (if valid)
-                if ((valid_segment = (thread_idx.a_idx < next_thread_idx.a_idx)))
-                    segment_end_offset = d_segment_end_offsets[thread_idx.a_idx];
-            }
-            else if (valid_value)
-            {
-                // Consume this value index
-
-                // Update the tuple's value with the value at this index.
-                partial_reductions[ITEM].value = d_values[value_offset];
-
-                // Increment value index
-                thread_idx.b_idx++;
-
-                // Read next value offset (if valid)
-                if ((valid_value = (thread_idx.b_idx < next_thread_idx.b_idx)))
-                    value_offset = d_value_offsets[thread_idx.b_idx];
-            }
-        }
-
-        // Use prefix scan to reduce values by segment-id.  The segment-reductions end up in items flagged as segment-tails.
-        KeyValuePair block_aggregate;
-        BlockScan(temp_storage.scan).InclusiveScan(
-            partial_reductions,             // Scan input
-            partial_reductions,             // Scan output
-            scan_op,                        // Scan operator
-            block_aggregate,                // Block-wide total (unused)
-            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
-
-        // The first segment index for this region (hoist?)
-        OffsetT first_segment_idx = temp_storage.block_idx.a_idx[0];
-
-        // Scatter an accumulated reduction if it is the head of a valid segment
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if (tail_flags[ITEM])
-            {
-                OffsetT segment_idx = partial_reductions[ITEM].key;
-                Value   value       = partial_reductions[ITEM].value;
-
-                // Write value reduction to corresponding segment id
-                d_output[segment_idx] = value;
-
-                // Save off the first value product that this thread block will scatter
-                if (segment_idx == first_segment_idx)
-                {
-                    temp_storage.first_tuple.value = value;
-                }
-            }
-        }
-*/
-    }
-
-
-
-    /**
-     * Have the thread block process the specified region of the MergePath decision path
-     */
-    __device__ __forceinline__ void ProcessRegion(
-        OffsetT         block_diagonal,
-        OffsetT         next_block_diagonal,
-        KeyValuePair    &first_tuple,       // [Out] Valid in thread-0
-        KeyValuePair    &last_tuple)        // [Out] Valid in thread-0
-    {
-        // Thread block initialization
-        if (threadIdx.x < 2)
-        {
-            // Retrieve block starting and ending indices
-            IndexPair block_idx = {0, 0};
-            if (gridDim.x > 1)
-            {
-                block_idx = d_block_idx[blockIdx.x + threadIdx.x];
-            }
-            else if (threadIdx.x > 0)
-            {
-                block_idx.a_idx = num_segments;
-                block_idx.b_idx = num_values;
-            }
-
-            // Share block starting and ending indices
-            temp_storage.block_region_idx[threadIdx.x] = block_idx;
-
-            // Initialize the block's running prefix
-            if (threadIdx.x == 0)
-            {
-                prefix_op.running_total.key    = block_idx.a_idx;
-                prefix_op.running_total.value  = identity;
-
-                // Initialize the "first scattered partial reduction tuple" to the prefix tuple (in case we don't actually scatter one)
-                temp_storage.first_tuple = prefix_op.running_total;
-            }
-        }
-
-        // Ensure coherence of region indices
-        __syncthreads();
-
-        // Read block's starting indices
-        IndexPair block_idx = temp_storage.block_region_idx[0];
-
-        // Have the thread block iterate over the region
-        #pragma unroll 1
-        while (block_diagonal < next_block_diagonal)
-        {
-            // Read block's ending indices (hoist?)
-            IndexPair next_block_idx = temp_storage.block_region_idx[1];
-
-            // Clamp the per-thread search range to within one work-tile of block's current indices
-            IndexPair next_tile_idx;
-            next_tile_idx.a_idx = CUB_MIN(next_block_idx.a_idx, block_idx.a_idx + TILE_ITEMS);
-            next_tile_idx.b_idx = CUB_MIN(next_block_idx.b_idx, block_idx.b_idx + TILE_ITEMS);
-
-            // Have each thread search for the end-indices of its subranges within the segment and value inputs
-            IndexPair next_thread_idx;
-            if (USE_SMEM_SEGMENT_CACHE)
-            {
-                // Search in smem cache
-                OffsetT num_segments = next_tile_idx.a_idx - block_idx.a_idx;
-
-                // Load global
-                SegmentOffset segment_offsets[ITEMS_PER_THREAD];
-                LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_segment_end_offsets + block_idx.a_idx, segment_offsets, num_segments, num_values);
-
-                // Store to shared
-                StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, temp_storage.cached_segment_end_offsets, segment_offsets);
-
-                __syncthreads();
-
-                OffsetT next_thread_diagonal = block_diagonal + ((threadIdx.x + 1) * ITEMS_PER_THREAD);
-
-                MergePathSearch(
-                    next_thread_diagonal,                       // Next thread diagonal
-                    temp_storage.cached_segment_end_offsets - block_idx.a_idx,                      // A (segment end-offsets)
-                    d_value_offsets,                            // B (value offsets)
-                    block_idx,                                  // Start indices into A and B
-                    next_tile_idx,                              // End indices into A and B
-                    next_thread_idx);                           // [out] diagonal intersection indices into A and B
-            }
-            else
-            {
-                // Search in global
-
-                OffsetT next_thread_diagonal = block_diagonal + ((threadIdx.x + 1) * ITEMS_PER_THREAD);
-
-                MergePathSearch(
-                    next_thread_diagonal,                       // Next thread diagonal
-                    d_segment_end_offsets,                      // A (segment end-offsets)
-                    d_value_offsets,                            // B (value offsets)
-                    block_idx,                                  // Start indices into A and B
-                    next_tile_idx,                              // End indices into A and B
-                    next_thread_idx);                           // [out] diagonal intersection indices into A and B
-            }
-
-            // Share thread end-indices to get thread begin-indices and tile end-indices
-            IndexPair thread_idx;
-
-            BlockShift(temp_storage.shift).Up(
-                next_thread_idx,    // Input item
-                thread_idx,         // [out] Output item
-                block_idx,          // Prefix item to be provided to <em>thread</em><sub>0</sub>
-                next_tile_idx);     // [out] Suffix item shifted out by the <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub> to be provided to all threads
-
-//            if (block_idx.a_idx == next_tile_idx.a_idx)
-//            {
-//                // There are no segment end-offsets in this tile.  Perform a
-//                // simple block-wide reduction and accumulate the result into
-//                // the running total.
-//                SingleSegmentTile(next_tile_idx, block_idx);
-//            }
-//          else if (block_idx.b_idx == next_tile_idx.b_idx)
-//            {
-//                // There are no values in this tile (only empty segments).
-//                EmptySegmentsTile(next_tile_idx.a_idx, block_idx.a_idx);
-//            }
-//            else
-            if ((next_tile_idx.a_idx < num_segments) && (next_tile_idx.b_idx < num_values))
-            {
-                // Merge the tile's segment and value indices (full tile)
-                MultiSegmentTile<true>(block_idx, thread_idx, next_thread_idx, next_tile_idx);
-            }
-            else
-            {
-                // Merge the tile's segment and value indices (partially full tile)
-                MultiSegmentTile<false>(block_idx, thread_idx, next_thread_idx, next_tile_idx);
-            }
-
-            // Advance the block's indices in preparation for the next tile
-            block_idx = next_tile_idx;
-
-            // Advance to the next region in the decision path
-            block_diagonal += TILE_ITEMS;
-
-            // Barrier for smem reuse
-            __syncthreads();
-        }
-
-        // Get first and last tuples for the region
-        if (threadIdx.x == 0)
-        {
-            first_tuple = temp_storage.first_tuple;
-            last_tuple = prefix_op.running_total;
-        }
-
-    }
-
-
-};
-
-
-
-
-
-
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockSegReduceRegionByKey
- */
-template <
-    int                     _BLOCK_THREADS,             ///< Threads per thread block
-    int                     _ITEMS_PER_THREAD,          ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm      _LOAD_ALGORITHM,            ///< The BlockLoad algorithm to use
-    bool                    _LOAD_WARP_TIME_SLICING,    ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
-    CacheLoadModifier       _LOAD_MODIFIER,             ///< Cache load modifier for reading input elements
-    BlockScanAlgorithm      _SCAN_ALGORITHM>            ///< The BlockScan algorithm to use
-struct BlockSegReduceRegionByKeyPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-        LOAD_WARP_TIME_SLICING  = _LOAD_WARP_TIME_SLICING,      ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)    };
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-/******************************************************************************
- * Persistent thread block types
- ******************************************************************************/
-
-/**
- * \brief BlockSegReduceRegionByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
- */
-template <
-    typename    BlockSegReduceRegionByKeyPolicy,        ///< Parameterized BlockSegReduceRegionByKeyPolicy tuning policy
-    typename    InputIteratorT,                         ///< Random-access iterator referencing key-value input tuples
-    typename    OutputIteratorT,                        ///< Random-access iterator referencing segment output totals
-    typename    ReductionOp>                            ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-struct BlockSegReduceRegionByKey
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = BlockSegReduceRegionByKeyPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockSegReduceRegionByKeyPolicy::ITEMS_PER_THREAD,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-    // KeyValuePair input type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type KeyValuePair;
-
-    // Signed integer type for global offsets
-    typedef typename KeyValuePair::Key OffsetT;
-
-    // Value type
-    typedef typename KeyValuePair::Value Value;
-
-    // Head flag type
-    typedef int HeadFlag;
-
-    // Input iterator wrapper type for loading KeyValuePair elements through cache
-    typedef CacheModifiedInputIterator<
-            BlockSegReduceRegionByKeyPolicy::LOAD_MODIFIER,
-            KeyValuePair,
-            OffsetT>
-        WrappedInputIteratorT;
-
-    // Parameterized BlockLoad type
-    typedef BlockLoad<
-            WrappedInputIteratorT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            BlockSegReduceRegionByKeyPolicy::LOAD_ALGORITHM,
-            BlockSegReduceRegionByKeyPolicy::LOAD_WARP_TIME_SLICING>
-        BlockLoad;
-
-    // BlockScan scan operator for reduction-by-segment
-    typedef ReduceByKeyOp<ReductionOp> ReduceByKeyOp;
-
-    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
-    typedef RunningBlockPrefixCallbackOp<
-            KeyValuePair,
-            ReduceByKeyOp>
-        RunningPrefixCallbackOp;
-
-    // Parameterized BlockScan type for block-wide reduce-value-by-key
-    typedef BlockScan<
-            KeyValuePair,
-            BLOCK_THREADS,
-            BlockSegReduceRegionByKeyPolicy::SCAN_ALGORITHM>
-        BlockScan;
-
-    // Parameterized BlockDiscontinuity type for identifying key discontinuities
-    typedef BlockDiscontinuity<
-            OffsetT,
-            BLOCK_THREADS>
-        BlockDiscontinuity;
-
-    // Operator for detecting discontinuities in a list of segment identifiers.
-    struct NewSegmentOp
-    {
-        /// Returns true if row_b is the start of a new row
-        __device__ __forceinline__ bool operator()(const OffsetT& b, const OffsetT& a)
-        {
-            return (a != b);
-        }
-    };
-
-    // Shared memory type for this thread block
-    struct _TempStorage
-    {
-        union
-        {
-            typename BlockLoad::TempStorage                 load;           // Smem needed for tile loading
-            struct {
-                typename BlockScan::TempStorage             scan;           // Smem needed for reduce-value-by-segment scan
-                typename BlockDiscontinuity::TempStorage    discontinuity;  // Smem needed for head-flagging
-            };
-        };
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage                &temp_storage;          ///< Reference to shared storage
-    WrappedInputIteratorT       d_tuple_partials;       ///< A sequence of partial reduction tuples to scan
-    OutputIteratorT              d_output;               ///< A sequence of segment totals
-    Value                       identity;               ///< Identity value (for zero-length segments)
-    ReduceByKeyOp               scan_op;                ///< Reduce-by-key scan operator
-    RunningPrefixCallbackOp     prefix_op;              ///< Stateful running total for block-wide prefix scan of partial reduction tuples
-
-
-    //---------------------------------------------------------------------
-    // Operations
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__
-    BlockSegReduceRegionByKey(
-        TempStorage             &temp_storage,          ///< Reference to shared storage
-        InputIteratorT          d_tuple_partials,       ///< A sequence of partial reduction tuples to scan
-        OutputIteratorT          d_output,               ///< A sequence of segment totals
-        Value                   identity,               ///< Identity value (for zero-length segments)
-        ReductionOp             reduction_op)           ///< Reduction operator
-    :
-        temp_storage(temp_storage.Alias()),
-        d_tuple_partials(d_tuple_partials),
-        d_output(d_output),
-        identity(identity),
-        scan_op(reduction_op),
-        prefix_op(scan_op)
-    {}
-
-
-
-    /**
-     * Processes a reduce-value-by-key input tile, outputting reductions for each segment
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__
-    void ProcessTile(
-        OffsetT block_offset,
-        OffsetT first_segment_idx,
-        OffsetT last_segment_idx,
-        int guarded_items = TILE_ITEMS)
-    {
-        KeyValuePair    partial_reductions[ITEMS_PER_THREAD];
-        OffsetT         segment_ids[ITEMS_PER_THREAD];
-        HeadFlag        head_flags[ITEMS_PER_THREAD];
-
-        // Load a tile of block partials from previous kernel
-        if (FULL_TILE)
-        {
-            // Full tile
-            BlockLoad(temp_storage.load).Load(d_tuple_partials + block_offset, partial_reductions);
-        }
-        else
-        {
-            KeyValuePair oob_default;
-            oob_default.key    = last_segment_idx;       // The last segment ID to be reduced
-            oob_default.value  = identity;
-
-            // Partially-full tile
-            BlockLoad(temp_storage.load).Load(d_tuple_partials + block_offset, partial_reductions, guarded_items, oob_default);
-        }
-
-        // Barrier for shared memory reuse
-        __syncthreads();
-
-        // Copy the segment IDs for head-flagging
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            segment_ids[ITEM] = partial_reductions[ITEM].key;
-        }
-
-        // FlagT segment heads by looking for discontinuities
-        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
-            head_flags,                         // [out] Head flags
-            segment_ids,                        // Segment ids
-            NewSegmentOp(),                     // Functor for detecting start of new rows
-            prefix_op.running_total.key);       // Last segment ID from previous tile to compare with first segment ID in this tile
-
-        // Reduce-value-by-segment across partial_reductions using exclusive prefix scan
-        KeyValuePair block_aggregate;
-        BlockScan(temp_storage.scan).ExclusiveScan(
-            partial_reductions,                   // Scan input
-            partial_reductions,                   // Scan output
-            scan_op,                        // Scan operator
-            block_aggregate,                // Block-wide total (unused)
-            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total
-
-        // Scatter an accumulated reduction if it is the head of a valid segment
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if (head_flags[ITEM])
-            {
-                d_output[partial_reductions[ITEM].key] = partial_reductions[ITEM].value;
-            }
-        }
-    }
-
-
-    /**
-     * Iterate over input tiles belonging to this thread block
-     */
-    __device__ __forceinline__
-    void ProcessRegion(
-        OffsetT block_offset,
-        OffsetT block_end,
-        OffsetT first_segment_idx,
-        OffsetT last_segment_idx)
-    {
-        if (threadIdx.x == 0)
-        {
-            // Initialize running prefix to the first segment index paired with identity
-            prefix_op.running_total.key    = first_segment_idx;
-            prefix_op.running_total.value  = identity;
-        }
-
-        // Process full tiles
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            ProcessTile<true>(block_offset, first_segment_idx, last_segment_idx);
-            __syncthreads();
-
-            block_offset += TILE_ITEMS;
-        }
-
-        // Process final value tile (if present)
-        int guarded_items = block_end - block_offset;
-        if (guarded_items)
-        {
-            ProcessTile<false>(block_offset, first_segment_idx, last_segment_idx, guarded_items);
-        }
-    }
-};
-
-
-
-/******************************************************************************
- * Kernel entrypoints
- ******************************************************************************/
-
-/**
- * Segmented reduce region kernel entry point (multi-block).
- */
-
-template <
-    typename SegmentOffsetIterator,             ///< Random-access input iterator type for reading segment end-offsets
-    typename OffsetT>                           ///< Signed integer type for global offsets
-__global__ void SegReducePartitionKernel(
-    SegmentOffsetIterator       d_segment_end_offsets,  ///< [in] A sequence of \p num_segments segment end-offsets
-    IndexPair<OffsetT>          *d_block_idx,
-    int                         num_partition_samples,
-    OffsetT                     num_values,             ///< [in] Number of values to reduce
-    OffsetT                     num_segments,           ///< [in] Number of segments being reduced
-    GridEvenShare<OffsetT>      even_share)             ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
-{
-    // Segment offset type
-    typedef typename std::iterator_traits<SegmentOffsetIterator>::value_type SegmentOffset;
-
-    // Counting iterator type
-    typedef CountingInputIterator<SegmentOffsetT, OffsetT> CountingIterator;
-
-    // Cache-modified iterator for segment end-offsets
-    CacheModifiedInputIterator<LOAD_LDG, SegmentOffsetT, OffsetT> d_wrapped_segment_end_offsets(d_segment_end_offsets);
-
-    // Counting iterator for value offsets
-    CountingIterator d_value_offsets(0);
-
-    // Initialize even-share to tell us where to start and stop our tile-processing
-    int partition_id = (blockDim.x * blockIdx.x) + threadIdx.x;
-    even_share.Init(partition_id);
-
-    // Search for block starting and ending indices
-    IndexPair<OffsetT> start_idx = {0, 0};
-    IndexPair<OffsetT> end_idx   = {num_segments, num_values};
-    IndexPair<OffsetT> block_idx;
-
-    MergePathSearch(
-        even_share.block_offset,            // Next thread diagonal
-        d_wrapped_segment_end_offsets,      // A (segment end-offsets)
-        d_value_offsets,                    // B (value offsets)
-        start_idx,                          // Start indices into A and B
-        end_idx,                            // End indices into A and B
-        block_idx);                         // [out] diagonal intersection indices into A and B
-
-    // Write output
-    if (partition_id < num_partition_samples)
-    {
-        d_block_idx[partition_id] = block_idx;
-    }
-}
-
-
-/**
- * Segmented reduce region kernel entry point (multi-block).
- */
-template <
-    typename BlockSegReduceRegionPolicy,        ///< Parameterized BlockSegReduceRegionPolicy tuning policy
-    typename SegmentOffsetIterator,             ///< Random-access input iterator type for reading segment end-offsets
-    typename ValueIterator,                     ///< Random-access input iterator type for reading values
-    typename OutputIteratorT,                   ///< Random-access output iterator type for writing segment reductions
-    typename ReductionOp,                       ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename OffsetT,                           ///< Signed integer type for global offsets
-    typename Value>                             ///< Value type
-__launch_bounds__ (BlockSegReduceRegionPolicy::BLOCK_THREADS)
-__global__ void SegReduceRegionKernel(
-    SegmentOffsetIterator       d_segment_end_offsets,  ///< [in] A sequence of \p num_segments segment end-offsets
-    ValueIterator               d_values,               ///< [in] A sequence of \p num_values values
-    OutputIteratorT              d_output,               ///< [out] A sequence of \p num_segments segment totals
-    KeyValuePair<OffsetT, Value> *d_tuple_partials,      ///< [out] A sequence of (gridDim.x * 2) partial reduction tuples
-    IndexPair<OffsetT>          *d_block_idx,
-    OffsetT                     num_values,             ///< [in] Number of values to reduce
-    OffsetT                     num_segments,           ///< [in] Number of segments being reduced
-    Value                       identity,               ///< [in] Identity value (for zero-length segments)
-    ReductionOp                 reduction_op,           ///< [in] Reduction operator
-    GridEvenShare<OffsetT>      even_share)             ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
-{
-    typedef KeyValuePair<OffsetT, Value> KeyValuePair;
-
-    // Specialize thread block abstraction type for reducing a range of segmented values
-    typedef BlockSegReduceRegion<
-            BlockSegReduceRegionPolicy,
-            SegmentOffsetIterator,
-            ValueIterator,
-            OutputIteratorT,
-            ReductionOp,
-            OffsetT>
-        BlockSegReduceRegion;
-
-    // Shared memory allocation
-    __shared__ typename BlockSegReduceRegion::TempStorage temp_storage;
-
-    // Initialize thread block even-share to tell us where to start and stop our tile-processing
-    even_share.BlockInit();
-
-    // Construct persistent thread block
-    BlockSegReduceRegion thread_block(
-        temp_storage,
-        d_segment_end_offsets,
-        d_values,
-        d_output,
-        d_block_idx,
-        num_values,
-        num_segments,
-        identity,
-        reduction_op);
-
-    // First and last partial reduction tuples within the range (valid in thread-0)
-    KeyValuePair first_tuple, last_tuple;
-
-    // Consume block's region of work
-    thread_block.ProcessRegion(
-        even_share.block_offset,
-        even_share.block_end,
-        first_tuple,
-        last_tuple);
-
-    if (threadIdx.x == 0)
-    {
-        if (gridDim.x > 1)
-        {
-            // Special case where the first segment written and the carry-out are for the same segment
-            if (first_tuple.key == last_tuple.key)
-            {
-                first_tuple.value = identity;
-            }
-
-            // Write the first and last partial products from this thread block so
-            // that they can be subsequently "fixed up" in the next kernel.
-            d_tuple_partials[blockIdx.x * 2]          = first_tuple;
-            d_tuple_partials[(blockIdx.x * 2) + 1]    = last_tuple;
-        }
-    }
-
-}
-
-
-/**
- * Segmented reduce region kernel entry point (single-block).
- */
-template <
-    typename    BlockSegReduceRegionByKeyPolicy,        ///< Parameterized BlockSegReduceRegionByKeyPolicy tuning policy
-    typename    InputIteratorT,                         ///< Random-access iterator referencing key-value input tuples
-    typename    OutputIteratorT,                        ///< Random-access iterator referencing segment output totals
-    typename    ReductionOp,                            ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename    OffsetT,                                ///< Signed integer type for global offsets
-    typename    Value>                                  ///< Value type
-__launch_bounds__ (BlockSegReduceRegionByKeyPolicy::BLOCK_THREADS, 1)
-__global__ void SegReduceRegionByKeyKernel(
-    InputIteratorT          d_tuple_partials,           ///< [in] A sequence of partial reduction tuples
-    OutputIteratorT          d_output,                   ///< [out] A sequence of \p num_segments segment totals
-    OffsetT                 num_segments,               ///< [in] Number of segments in the \p d_output sequence
-    int                     num_tuple_partials,         ///< [in] Number of partial reduction tuples being reduced
-    Value                   identity,                   ///< [in] Identity value (for zero-length segments)
-    ReductionOp             reduction_op)               ///< [in] Reduction operator
-{
-    // Specialize thread block abstraction type for reducing a range of values by key
-    typedef BlockSegReduceRegionByKey<
-            BlockSegReduceRegionByKeyPolicy,
-            InputIteratorT,
-            OutputIteratorT,
-            ReductionOp>
-        BlockSegReduceRegionByKey;
-
-    // Shared memory allocation
-    __shared__ typename BlockSegReduceRegionByKey::TempStorage temp_storage;
-
-    // Construct persistent thread block
-    BlockSegReduceRegionByKey thread_block(
-        temp_storage,
-        d_tuple_partials,
-        d_output,
-        identity,
-        reduction_op);
-
-    // Process input tiles
-    thread_block.ProcessRegion(
-        0,                          // Region start
-        num_tuple_partials,         // Region end
-        0,                          // First segment ID
-        num_segments);              // Last segment ID (one-past)
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceReduce
- */
-template <
-    typename ValueIterator,                     ///< Random-access input iterator type for reading values
-    typename SegmentOffsetIterator,             ///< Random-access input iterator type for reading segment end-offsets
-    typename OutputIteratorT,                   ///< Random-access output iterator type for writing segment reductions
-    typename ReductionOp,                       ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename OffsetT>                           ///< Signed integer type for global offsets
-struct DeviceSegReduceDispatch
-{
-    // Value type
-    typedef typename std::iterator_traits<ValueIterator>::value_type Value;
-
-    // Reduce-by-key data type tuple (segment-ID, value)
-    typedef KeyValuePair<OffsetT, Value> KeyValuePair;
-
-    // Index pair data type
-    typedef IndexPair<OffsetT>IndexPair;
-
-
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        // ReduceRegionPolicy
-        typedef BlockSegReduceRegionPolicy<
-                128,                            ///< Threads per thread block
-                6,                              ///< Items per thread (per tile of input)
-                true,                           ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
-                false,                          ///< Whether or not to cache incoming values in shared memory before reducing each tile
-                LOAD_DEFAULT,                   ///< Cache load modifier for reading segment offsets
-                LOAD_LDG,                       ///< Cache load modifier for reading values
-                BLOCK_REDUCE_RAKING,            ///< The BlockReduce algorithm to use
-                BLOCK_SCAN_WARP_SCANS>          ///< The BlockScan algorithm to use
-            SegReduceRegionPolicy;
-
-        // ReduceRegionByKeyPolicy
-        typedef BlockSegReduceRegionByKeyPolicy<
-                256,                            ///< Threads per thread block
-                9,                             ///< Items per thread (per tile of input)
-                BLOCK_LOAD_DIRECT,              ///< The BlockLoad algorithm to use
-                false,                          ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
-                LOAD_LDG,                       ///< Cache load modifier for reading input elements
-                BLOCK_SCAN_WARP_SCANS>          ///< The BlockScan algorithm to use
-            SegReduceRegionByKeyPolicy;
-    };
-
-
-    /// SM10
-    struct Policy100
-    {
-        // ReduceRegionPolicy
-        typedef BlockSegReduceRegionPolicy<
-                128,                            ///< Threads per thread block
-                3,                              ///< Items per thread (per tile of input)
-                false,                          ///< Whether or not to cache incoming segment offsets in shared memory before reducing each tile
-                false,                          ///< Whether or not to cache incoming values in shared memory before reducing each tile
-                LOAD_DEFAULT,                   ///< Cache load modifier for reading segment offsets
-                LOAD_DEFAULT,                   ///< Cache load modifier for reading values
-                BLOCK_REDUCE_RAKING,            ///< The BlockReduce algorithm to use
-                BLOCK_SCAN_RAKING>              ///< The BlockScan algorithm to use
-            SegReduceRegionPolicy;
-
-        // ReduceRegionByKeyPolicy
-        typedef BlockSegReduceRegionByKeyPolicy<
-                128,                            ///< Threads per thread block
-                3,                              ///< Items per thread (per tile of input)
-                BLOCK_LOAD_WARP_TRANSPOSE,      ///< The BlockLoad algorithm to use
-                false,                          ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
-                LOAD_DEFAULT,                   ///< Cache load modifier for reading input elements
-                BLOCK_SCAN_WARP_SCANS>          ///< The BlockScan algorithm to use
-            SegReduceRegionByKeyPolicy;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-/*
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-*/
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxSegReduceRegionPolicy           : PtxPolicy::SegReduceRegionPolicy {};
-    struct PtxSegReduceRegionByKeyPolicy      : PtxPolicy::SegReduceRegionByKeyPolicy {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <
-        typename SegReduceKernelConfig,
-        typename SegReduceByKeyKernelConfig>
-    __host__ __device__ __forceinline__
-    static void InitConfigs(
-        int                         ptx_version,
-        SegReduceKernelConfig       &seg_reduce_region_config,
-        SegReduceByKeyKernelConfig  &seg_reduce_region_by_key_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        seg_reduce_region_config.Init<PtxSegReduceRegionPolicy>();
-        seg_reduce_region_by_key_config.Init<PtxSegReduceRegionByKeyPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            seg_reduce_region_config.template          Init<typename Policy350::SegReduceRegionPolicy>();
-            seg_reduce_region_by_key_config.template   Init<typename Policy350::SegReduceRegionByKeyPolicy>();
-        }
-/*
-        else if (ptx_version >= 300)
-        {
-            seg_reduce_region_config.template          Init<typename Policy300::SegReduceRegionPolicy>();
-            seg_reduce_region_by_key_config.template   Init<typename Policy300::SegReduceRegionByKeyPolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            seg_reduce_region_config.template          Init<typename Policy200::SegReduceRegionPolicy>();
-            seg_reduce_region_by_key_config.template   Init<typename Policy200::SegReduceRegionByKeyPolicy>();
-        }
-        else if (ptx_version >= 130)
-        {
-            seg_reduce_region_config.template          Init<typename Policy130::SegReduceRegionPolicy>();
-            seg_reduce_region_by_key_config.template   Init<typename Policy130::SegReduceRegionByKeyPolicy>();
-        }
-*/
-        else
-        {
-            seg_reduce_region_config.template          Init<typename Policy100::SegReduceRegionPolicy>();
-            seg_reduce_region_by_key_config.template   Init<typename Policy100::SegReduceRegionByKeyPolicy>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * SegReduceRegionKernel kernel dispatch configuration
-     */
-    struct SegReduceKernelConfig
-    {
-        int                     block_threads;
-        int                     items_per_thread;
-        bool                    use_smem_segment_cache;
-        bool                    use_smem_value_cache;
-        CacheLoadModifier       load_modifier_segments;
-        CacheLoadModifier       load_modifier_values;
-        BlockReduceAlgorithm    reduce_algorithm;
-        BlockScanAlgorithm      scan_algorithm;
-
-        template <typename SegReduceRegionPolicy>
-        __host__ __device__ __forceinline__
-        void Init()
-        {
-            block_threads               = SegReduceRegionPolicy::BLOCK_THREADS;
-            items_per_thread            = SegReduceRegionPolicy::ITEMS_PER_THREAD;
-            use_smem_segment_cache      = SegReduceRegionPolicy::USE_SMEM_SEGMENT_CACHE;
-            use_smem_value_cache        = SegReduceRegionPolicy::USE_SMEM_VALUE_CACHE;
-            load_modifier_segments      = SegReduceRegionPolicy::LOAD_MODIFIER_SEGMENTS;
-            load_modifier_values        = SegReduceRegionPolicy::LOAD_MODIFIER_VALUES;
-            reduce_algorithm            = SegReduceRegionPolicy::REDUCE_ALGORITHM;
-            scan_algorithm              = SegReduceRegionPolicy::SCAN_ALGORITHM;
-        }
-    };
-
-    /**
-     * SegReduceRegionByKeyKernel kernel dispatch configuration
-     */
-    struct SegReduceByKeyKernelConfig
-    {
-        int                     block_threads;
-        int                     items_per_thread;
-        BlockLoadAlgorithm      load_algorithm;
-        bool                    load_warp_time_slicing;
-        CacheLoadModifier       load_modifier;
-        BlockScanAlgorithm      scan_algorithm;
-
-        template <typename SegReduceRegionByKeyPolicy>
-        __host__ __device__ __forceinline__
-        void Init()
-        {
-            block_threads               = SegReduceRegionByKeyPolicy::BLOCK_THREADS;
-            items_per_thread            = SegReduceRegionByKeyPolicy::ITEMS_PER_THREAD;
-            load_algorithm              = SegReduceRegionByKeyPolicy::LOAD_ALGORITHM;
-            load_warp_time_slicing      = SegReduceRegionByKeyPolicy::LOAD_WARP_TIME_SLICING;
-            load_modifier               = SegReduceRegionByKeyPolicy::LOAD_MODIFIER;
-            scan_algorithm              = SegReduceRegionByKeyPolicy::SCAN_ALGORITHM;
-        }
-    };
-
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-    /**
-     * Internal dispatch routine for computing a device-wide segmented reduction.
-     */
-    template <
-        typename                        SegReducePartitionKernelPtr,
-        typename                        SegReduceRegionKernelPtr,               ///< Function type of cub::SegReduceRegionKernel
-        typename                        SegReduceRegionByKeyKernelPtr>          ///< Function type of cub::SegReduceRegionByKeyKernel
-    __host__ __device__ __forceinline__
-    static cudaError_t Dispatch(
-        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
-        size_t                          &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
-        ValueIterator                   d_values,                               ///< [in] A sequence of \p num_values data to reduce
-        SegmentOffsetIterator           d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
-        OutputIteratorT                  d_output,                               ///< [out] A sequence of \p num_segments segment totals
-        OffsetT                         num_values,                             ///< [in] Total number of values to reduce
-        OffsetT                         num_segments,                           ///< [in] Number of segments being reduced
-        Value                           identity,                               ///< [in] Identity value (for zero-length segments)
-        ReductionOp                     reduction_op,                           ///< [in] Reduction operator
-        cudaStream_t                    stream,                                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                            debug_synchronous,                      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                             sm_version,                             ///< [in] SM version of target device to use when computing SM occupancy
-        SegReducePartitionKernelPtr     seg_reduce_partition_kernel,            ///< [in] Kernel function pointer to parameterization of cub::SegReduceRegionKernel
-        SegReduceRegionKernelPtr        seg_reduce_region_kernel,               ///< [in] Kernel function pointer to parameterization of cub::SegReduceRegionKernel
-        SegReduceRegionByKeyKernelPtr   seg_reduce_region_by_key_kernel,        ///< [in] Kernel function pointer to parameterization of cub::SegReduceRegionByKeyKernel
-        SegReduceKernelConfig           &seg_reduce_region_config,              ///< [in] Dispatch parameters that match the policy that \p seg_reduce_region_kernel was compiled for
-        SegReduceByKeyKernelConfig      &seg_reduce_region_by_key_config)       ///< [in] Dispatch parameters that match the policy that \p seg_reduce_region_by_key_kernel was compiled for
-    {
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Dispatch two kernels: (1) a multi-block segmented reduction
-            // to reduce regions by block, and (2) a single-block reduce-by-key kernel
-            // to "fix up" segments spanning more than one region.
-
-            // Tile size of seg_reduce_region_kernel
-            int tile_size = seg_reduce_region_config.block_threads * seg_reduce_region_config.items_per_thread;
-
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Get SM occupancy for histogram_region_kernel
-            int seg_reduce_region_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                seg_reduce_region_sm_occupancy,
-                sm_version,
-                seg_reduce_region_kernel,
-                seg_reduce_region_config.block_threads))) break;
-
-            // Get device occupancy for histogram_region_kernel
-            int seg_reduce_region_occupancy = seg_reduce_region_sm_occupancy * sm_count;
-
-            // Even-share work distribution
-            int num_diagonals = num_values + num_segments;                  // Total number of work items
-            int subscription_factor = seg_reduce_region_sm_occupancy;       // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
-            int max_grid_size = seg_reduce_region_occupancy * subscription_factor;
-            GridEvenShare<OffsetT>even_share(
-                num_diagonals,
-                max_grid_size,
-                tile_size);
-
-            // Get grid size for seg_reduce_region_kernel
-            int seg_reduce_region_grid_size = even_share.grid_size;
-
-            // Number of "fix-up" reduce-by-key tuples (2 per thread block)
-            int num_tuple_partials = seg_reduce_region_grid_size * 2;
-            int num_partition_samples = seg_reduce_region_grid_size + 1;
-
-            // Temporary storage allocation requirements
-            void* allocations[2];
-            size_t allocation_sizes[2] =
-            {
-                num_tuple_partials * sizeof(KeyValuePair),  // bytes needed for "fix-up" reduce-by-key tuples
-                num_partition_samples * sizeof(IndexPair),  // bytes needed block indices
-            };
-
-            // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                return cudaSuccess;
-            }
-
-            // Alias the allocations
-            KeyValuePair    *d_tuple_partials   = (KeyValuePair*) allocations[0];           // "fix-up" tuples
-            IndexPair       *d_block_idx        = (IndexPair *) allocations[1];             // block starting/ending indices
-
-            // Array of segment end-offsets
-            SegmentOffsetIterator d_segment_end_offsets = d_segment_offsets + 1;
-
-            // Grid launch params for seg_reduce_partition_kernel
-            int partition_block_size = 32;
-            int partition_grid_size = (num_partition_samples + partition_block_size - 1) / partition_block_size;
-
-            // Partition work among multiple thread blocks if necessary
-            if (seg_reduce_region_grid_size > 1)
-            {
-                // Log seg_reduce_partition_kernel configuration
-                if (debug_synchronous) _CubLog("Invoking seg_reduce_partition_kernel<<<%d, %d, 0, %lld>>>()\n",
-                    partition_grid_size, partition_block_size, (long long) stream);
-
-                // Invoke seg_reduce_partition_kernel
-                seg_reduce_partition_kernel<<<partition_grid_size, partition_block_size, 0, stream>>>(
-                    d_segment_end_offsets,  ///< [in] A sequence of \p num_segments segment end-offsets
-                    d_block_idx,
-                    num_partition_samples,
-                    num_values,             ///< [in] Number of values to reduce
-                    num_segments,           ///< [in] Number of segments being reduced
-                    even_share);            ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
-
-                // Sync the stream if specified
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-
-            // Log seg_reduce_region_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking seg_reduce_region_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                seg_reduce_region_grid_size, seg_reduce_region_config.block_threads, (long long) stream, seg_reduce_region_config.items_per_thread, seg_reduce_region_sm_occupancy);
-
-            // Mooch
-            if (CubDebug(error = cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte))) break;
-
-            // Invoke seg_reduce_region_kernel
-            seg_reduce_region_kernel<<<seg_reduce_region_grid_size, seg_reduce_region_config.block_threads, 0, stream>>>(
-                d_segment_end_offsets,
-                d_values,
-                d_output,
-                d_tuple_partials,
-                d_block_idx,
-                num_values,
-                num_segments,
-                identity,
-                reduction_op,
-                even_share);
-
-            // Sync the stream if specified
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-/*
-            // Perform "fix-up" of region partial reductions if grid size is greater than one thread block
-            if (seg_reduce_region_grid_size > 1)
-            {
-                // Log seg_reduce_region_by_key_kernel configuration
-                if (debug_synchronous) _CubLog("Invoking seg_reduce_region_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
-                    1, seg_reduce_region_by_key_config.block_threads, (long long) stream, seg_reduce_region_by_key_config.items_per_thread);
-
-                // Invoke seg_reduce_region_by_key_kernel
-                seg_reduce_region_by_key_kernel<<<1, seg_reduce_region_by_key_config.block_threads, 0, stream>>>(
-                    d_tuple_partials,
-                    d_output,
-                    num_segments,
-                    num_tuple_partials,
-                    identity,
-                    reduction_op);
-
-                // Sync the stream if specified
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-*/
-        }
-
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine for computing a device-wide segmented reduction.
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t Dispatch(
-        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
-        size_t                          &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
-        ValueIterator                   d_values,                               ///< [in] A sequence of \p num_values data to reduce
-        SegmentOffsetIterator           d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
-        OutputIteratorT                  d_output,                               ///< [out] A sequence of \p num_segments segment totals
-        OffsetT                         num_values,                             ///< [in] Total number of values to reduce
-        OffsetT                         num_segments,                           ///< [in] Number of segments being reduced
-        Value                           identity,                               ///< [in] Identity value (for zero-length segments)
-        ReductionOp                     reduction_op,                           ///< [in] Reduction operator
-        cudaStream_t                    stream,                                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                            debug_synchronous)                      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            SegReduceKernelConfig seg_reduce_region_config;
-            SegReduceByKeyKernelConfig seg_reduce_region_by_key_config;
-
-            InitConfigs(ptx_version, seg_reduce_region_config, seg_reduce_region_by_key_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_values,
-                d_segment_offsets,
-                d_output,
-                num_values,
-                num_segments,
-                identity,
-                reduction_op,
-                stream,
-                debug_synchronous,
-                ptx_version,            // Use PTX version instead of SM version because, as a statically known quantity, this improves device-side launch dramatically but at the risk of imprecise occupancy calculation for mismatches
-                SegReducePartitionKernel<SegmentOffsetIterator, OffsetT>,
-                SegReduceRegionKernel<PtxSegReduceRegionPolicy, SegmentOffsetIterator, ValueIterator, OutputIteratorT, ReductionOp, OffsetT, Value>,
-                SegReduceRegionByKeyKernel<PtxSegReduceRegionByKeyPolicy, KeyValuePair*, OutputIteratorT, ReductionOp, OffsetT, Value>,
-                seg_reduce_region_config,
-                seg_reduce_region_by_key_config))) break;
-        }
-        while (0);
-
-        return error;
-
-    }
-};
-
-
-
-
-/******************************************************************************
- * DeviceSegReduce
- *****************************************************************************/
-
-/**
- * \brief DeviceSegReduce provides operations for computing a device-wide, parallel segmented reduction across a sequence of data items residing within global memory.
- * \ingroup DeviceModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- * uses a binary combining operator to compute a single aggregate from a list of input elements.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceReduce}
- *
- */
-struct DeviceSegReduce
-{
-    /**
-     * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor.
-     *
-     * \par
-     * Does not support non-commutative reduction operators.
-     *
-     * \devicestorage
-     *
-     * \cdp
-     *
-     * \iterator
-     *
-     * \tparam ValueIterator            <b>[inferred]</b> Random-access input iterator type for reading values
-     * \tparam SegmentOffsetIterator    <b>[inferred]</b> Random-access input iterator type for reading segment end-offsets
-     * \tparam OutputIteratorT           <b>[inferred]</b> Random-access output iterator type for writing segment reductions
-     * \tparam Value                    <b>[inferred]</b> Value type
-     * \tparam ReductionOp              <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename                ValueIterator,
-        typename                SegmentOffsetIterator,
-        typename                OutputIteratorT,
-        typename                Value,
-        typename                ReductionOp>
-    __host__ __device__ __forceinline__
-    static cudaError_t Reduce(
-        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
-        ValueIterator           d_values,                               ///< [in] A sequence of \p num_values data to reduce
-        SegmentOffsetIterator   d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
-        OutputIteratorT          d_output,                               ///< [out] A sequence of \p num_segments segment totals
-        int                     num_values,                             ///< [in] Total number of values to reduce
-        int                     num_segments,                           ///< [in] Number of segments being reduced
-        Value                   identity,                               ///< [in] Identity value (for zero-length segments)
-        ReductionOp             reduction_op,                           ///< [in] Reduction operator
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        typedef DeviceSegReduceDispatch<
-                ValueIterator,
-                SegmentOffsetIterator,
-                OutputIteratorT,
-                ReductionOp,
-                OffsetT>
-            DeviceSegReduceDispatch;
-
-        return DeviceSegReduceDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_values,
-            d_segment_offsets,
-            d_output,
-            num_values,
-            num_segments,
-            identity,
-            reduction_op,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
-     *
-     * \par
-     * Does not support non-commutative summation.
-     *
-     * \devicestorage
-     *
-     * \cdp
-     *
-     * \iterator
-     *
-     * \tparam ValueIterator            <b>[inferred]</b> Random-access input iterator type for reading values
-     * \tparam SegmentOffsetIterator    <b>[inferred]</b> Random-access input iterator type for reading segment end-offsets
-     * \tparam OutputIteratorT           <b>[inferred]</b> Random-access output iterator type for writing segment reductions
-     */
-    template <
-        typename                ValueIterator,
-        typename                SegmentOffsetIterator,
-        typename                OutputIteratorT>
-    __host__ __device__ __forceinline__
-    static cudaError_t Sum(
-        void*               d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation.
-        ValueIterator           d_values,                               ///< [in] A sequence of \p num_values data to reduce
-        SegmentOffsetIterator   d_segment_offsets,                      ///< [in] A sequence of (\p num_segments + 1) segment offsets
-        OutputIteratorT          d_output,                               ///< [out] A sequence of \p num_segments segment totals
-        int                     num_values,                             ///< [in] Total number of values to reduce
-        int                     num_segments,                           ///< [in] Number of segments being reduced
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Value type
-        typedef typename std::iterator_traits<ValueIterator>::value_type Value;
-
-        Value identity = Value();
-        cub::Sum reduction_op;
-
-        typedef DeviceSegReduceDispatch<
-                ValueIterator,
-                SegmentOffsetIterator,
-                OutputIteratorT,
-                cub::Sum,
-                OffsetT>
-            DeviceSegReduceDispatch;
-
-        return DeviceSegReduceDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_values,
-            d_segment_offsets,
-            d_output,
-            num_values,
-            num_segments,
-            identity,
-            reduction_op,
-            stream,
-            debug_synchronous);
-    }
-};
-
-
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-/**
- * Initialize problem
- */
-template <typename OffsetT, typename Value>
-void Initialize(
-    GenMode         gen_mode,
-    Value           *h_values,
-    vector<OffsetT> &segment_offsets,
-    int             num_values,
-    int             avg_segment_size)
-{
-    // Initialize values
-//    if (g_verbose) printf("Values: ");
-    for (int i = 0; i < num_values; ++i)
-    {
-        InitValue(gen_mode, h_values[i], i);
-//        if (g_verbose) std::cout << h_values[i] << ", ";
-    }
-//    if (g_verbose) printf("\n\n");
-
-    // Initialize segment lengths
-    const unsigned int  MAX_INTEGER         = -1u;
-    const unsigned int  MAX_SEGMENT_LENGTH  = avg_segment_size * 2;
-    const double        SCALE_FACTOR        = double(MAX_SEGMENT_LENGTH) / double(MAX_INTEGER);
-
-    segment_offsets.push_back(0);
-
-    OffsetT consumed = 0;
-    OffsetT remaining = num_values;
-    while (remaining > 0)
-    {
-        // Randomly sample a 32-bit unsigned int
-        unsigned int segment_length;
-        RandomBits(segment_length);
-
-        // Scale to maximum segment length
-        segment_length = (unsigned int) (double(segment_length) * SCALE_FACTOR);
-        segment_length = CUB_MIN(segment_length, remaining);
-
-        consumed += segment_length;
-        remaining -= segment_length;
-
-        segment_offsets.push_back(consumed);
-    }
-}
-
-
-/**
- * Compute reference answer
- */
-template <typename OffsetT, typename Value>
-void ComputeReference(
-    Value       *h_values,
-    OffsetT     *h_segment_offsets,
-    Value       *h_reference,
-    int         num_segments,
-    Value       identity)
-{
-    if (g_verbose) printf("%d segment reductions: ", num_segments);
-    for (int segment = 0; segment < num_segments; ++segment)
-    {
-        h_reference[segment] = identity;
-
-        for (int i = h_segment_offsets[segment]; i < h_segment_offsets[segment + 1]; ++i)
-        {
-            h_reference[segment] += h_values[i];
-        }
-        if (g_verbose) std::cout << h_reference[segment] << ", ";
-    }
-    if (g_verbose) printf("\n\n");
-}
-
-
-/**
- * Simple test of device
- */
-template <
-    bool            CDP,
-    typename        OffsetT,
-    typename        Value,
-    typename        ReductionOp>
-void Test(
-    OffsetT         num_values,
-    int             avg_segment_size,
-    ReductionOp     reduction_op,
-    Value           identity,
-    char*           type_string)
-{
-    Value   *h_values = NULL;
-    Value   *h_reference = NULL;
-    OffsetT *h_segment_offsets = NULL;
-
-    printf("%d\n", num_values);
-
-    // Initialize problem on host
-    h_values = new Value[num_values];
-    vector<OffsetT> segment_offsets;
-    Initialize(UNIFORM, h_values, segment_offsets, num_values, avg_segment_size);
-
-    // Allocate simple offsets array and copy STL vector into it
-    h_segment_offsets = new OffsetT[segment_offsets.size()];
-    for (int i = 0; i < segment_offsets.size(); ++i)
-        h_segment_offsets[i] = segment_offsets[i];
-
-    OffsetT num_segments = segment_offsets.size() - 1;
-    if (g_verbose)
-    {
-        printf("%d segment offsets: ", num_segments);
-        for (int i = 0; i < num_segments; ++i)
-            std::cout << h_segment_offsets[i] << "(" << h_segment_offsets[i + 1] - h_segment_offsets[i] << "), ";
-        if (g_verbose) std::cout << std::endl << std::endl;
-    }
-
-    // Solve problem on host
-    h_reference = new Value[num_segments];
-    ComputeReference(h_values, h_segment_offsets, h_reference, num_segments, identity);
-
-    printf("\n\n%s cub::DeviceSegReduce::%s %d items (%d-byte %s), %d segments (%d-byte offset indices)\n",
-        (CDP) ? "CDP device invoked" : "Host-invoked",
-        (Equals<ReductionOp, Sum>::VALUE) ? "Sum" : "Reduce",
-        num_values, (int) sizeof(Value), type_string,
-        num_segments, (int) sizeof(OffsetT));
-    fflush(stdout);
-
-    // Allocate and initialize problem on device
-    Value   *d_values = NULL;
-    OffsetT *d_segment_offsets = NULL;
-    Value   *d_output = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * num_values));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_output, sizeof(Value) * num_segments));
-    CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * num_values, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
-
-    // Request and allocate temporary storage
-    void    *d_temp_storage = NULL;
-    size_t  temp_storage_bytes = 0;
-    CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Clear device output
-    CubDebugExit(cudaMemset(d_output, 0, sizeof(Value) * num_segments));
-
-    // Run warmup/correctness iteration
-    CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, true));
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults(h_reference, d_output, num_segments, true, g_verbose);
-    printf("\t%s", compare ? "FAIL" : "PASS");
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Performance
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-    for (int i = 0; i < g_timing_iterations; ++i)
-    {
-        CubDebugExit(DeviceSegReduce::Sum(d_temp_storage, temp_storage_bytes, d_values, d_segment_offsets, d_output, num_values, num_segments, 0, false));
-    }
-    gpu_timer.Stop();
-    float elapsed_millis = gpu_timer.ElapsedMillis();
-
-    // Display performance
-    if (g_timing_iterations > 0)
-    {
-        float avg_millis = elapsed_millis / g_timing_iterations;
-        float giga_rate = float(num_values) / avg_millis / 1000.0 / 1000.0;
-        float giga_bandwidth = giga_rate *
-        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
-    }
-
-    // Device cleanup
-    if (d_values) CubDebugExit(g_allocator.DeviceFree(d_values));
-    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
-    if (d_output) CubDebugExit(g_allocator.DeviceFree(d_output));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    // Host cleanup
-    if (h_values)           delete[] h_values;
-    if (h_segment_offsets)  delete[] h_segment_offsets;
-    if (h_reference)        delete[] h_reference;
-}
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_values          = 32 * 1024 * 1024;
-    int avg_segment_size    = 500;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_values);
-    args.GetCmdLineArgument("ss", avg_segment_size);
-    args.GetCmdLineArgument("i", g_timing_iterations);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--v] "
-            "[--i=<timing iterations>] "
-            "[--n=<input samples>]\n"
-            "[--ss=<average segment size>]\n"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    Test<false>((int) num_values, avg_segment_size, Sum(), (long long) 0, CUB_TYPE_STRING(long long));
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/experimental/histogram/histogram_cub.h b/external/cub/experimental/histogram/histogram_cub.h
deleted file mode 100644
index f33184a58b9..00000000000
--- a/external/cub/experimental/histogram/histogram_cub.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-#include <cub/device/device_histogram.cuh>
-
-using namespace cub;
-
-template <
-    int         NUM_CHANNELS,
-    int         ACTIVE_CHANNELS,
-    int         NUM_BINS,
-    typename    PixelType>
-double run_cub_histogram(
-    PixelType *d_image,
-    int width,
-    int height,
-    unsigned int *d_hist, 
-    bool is_warmup)
-{
-    enum {
-        is_float = Equals<PixelType, float4>::VALUE,
-    };
-
-    typedef typename If<is_float, float, unsigned char>::Type    SampleT;    // Sample type
-    typedef typename If<is_float, float, unsigned int>::Type     LevelT;     // Level type (uint32 for uchar)
-
-    // Setup data structures
-    unsigned int*       d_histogram[ACTIVE_CHANNELS];
-    int                 num_levels[ACTIVE_CHANNELS];            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    LevelT              lower_level[ACTIVE_CHANNELS];           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-    LevelT              upper_level[ACTIVE_CHANNELS];           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-
-    for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-    {
-        d_histogram[CHANNEL] = d_hist + (CHANNEL * NUM_BINS);
-        num_levels[CHANNEL] = NUM_BINS + 1;
-        lower_level[CHANNEL] = 0;
-        upper_level[CHANNEL] = (is_float) ? 1 : 256;
-    }
-
-    // Allocate temporary storage
-    size_t temp_storage_bytes = 0;
-    void *d_temp_storage = NULL;
-
-    SampleT* d_image_samples = (SampleT*) d_image;
-
-    // Get amount of temporary storage needed
-    DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, ACTIVE_CHANNELS>(
-        d_temp_storage,
-        temp_storage_bytes,
-        d_image_samples,
-        d_histogram,
-        num_levels,
-        lower_level,
-        upper_level,
-        width * height, 
-        (cudaStream_t) 0,
-        is_warmup);
-
-    cudaMalloc(&d_temp_storage, temp_storage_bytes);
-
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-
-    // Compute histogram
-    DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, ACTIVE_CHANNELS>(
-        d_temp_storage,
-        temp_storage_bytes,
-        d_image_samples,
-        d_histogram,
-        num_levels,
-        lower_level,
-        upper_level,
-        width * height, 
-        (cudaStream_t) 0,
-        is_warmup);
-
-    gpu_timer.Stop();
-    float elapsed_millis = gpu_timer.ElapsedMillis();
-
-    cudaFree(d_temp_storage);
-
-    return elapsed_millis;
-}
-
diff --git a/external/cub/experimental/histogram/histogram_gmem_atomics.h b/external/cub/experimental/histogram/histogram_gmem_atomics.h
deleted file mode 100644
index c3c9630d2e4..00000000000
--- a/external/cub/experimental/histogram/histogram_gmem_atomics.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-#include <test/test_util.h>
-
-namespace histogram_gmem_atomics
-{
-    // Decode float4 pixel into bins
-    template <int NUM_BINS, int ACTIVE_CHANNELS>
-    __device__ __forceinline__ void DecodePixel(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
-    {
-        float* samples = reinterpret_cast<float*>(&pixel);
-
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-            bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
-    }
-
-    // Decode uchar4 pixel into bins
-    template <int NUM_BINS, int ACTIVE_CHANNELS>
-    __device__ __forceinline__ void DecodePixel(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
-    {
-        unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);
-
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-            bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
-    }
-
-    // Decode uchar1 pixel into bins
-    template <int NUM_BINS, int ACTIVE_CHANNELS>
-    __device__ __forceinline__ void DecodePixel(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
-    {
-        bins[0] = (unsigned int) pixel.x;
-    }
-
-    // First-pass histogram kernel (binning into privatized counters)
-    template <
-        int         NUM_PARTS,
-        int         ACTIVE_CHANNELS,
-        int         NUM_BINS,
-        typename    PixelType>
-    __global__ void histogram_gmem_atomics(
-        const PixelType *in,
-        int width,
-        int height,
-        unsigned int *out)
-    {
-        // global position and size
-        int x = blockIdx.x * blockDim.x + threadIdx.x;
-        int y = blockIdx.y * blockDim.y + threadIdx.y;
-        int nx = blockDim.x * gridDim.x;
-        int ny = blockDim.y * gridDim.y;
-
-        // threads in workgroup
-        int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1
-        int nt = blockDim.x * blockDim.y; // total threads in workgroup
-
-        // group index in 0..ngroups-1
-        int g = blockIdx.x + blockIdx.y * gridDim.x;
-
-        // initialize smem
-        unsigned int *gmem = out + g * NUM_PARTS;
-        for (int i = t; i < ACTIVE_CHANNELS * NUM_BINS; i += nt)
-            gmem[i] = 0;
-        __syncthreads();
-
-        // process pixels (updates our group's partial histogram in gmem)
-        for (int col = x; col < width; col += nx)
-        {
-            for (int row = y; row < height; row += ny)
-            {
-                PixelType pixel = in[row * width + col];
-
-                unsigned int bins[ACTIVE_CHANNELS];
-                DecodePixel<NUM_BINS>(pixel, bins);
-
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-                    atomicAdd(&gmem[(NUM_BINS * CHANNEL) + bins[CHANNEL]], 1);
-            }
-        }
-    }
-
-    // Second pass histogram kernel (accumulation)
-    template <
-        int         NUM_PARTS,
-        int         ACTIVE_CHANNELS,
-        int         NUM_BINS>
-    __global__ void histogram_gmem_accum(
-        const unsigned int *in,
-        int n,
-        unsigned int *out)
-    {
-        int i = blockIdx.x * blockDim.x + threadIdx.x;
-        if (i > ACTIVE_CHANNELS * NUM_BINS)
-            return; // out of range
-
-        unsigned int total = 0;
-        for (int j = 0; j < n; j++)
-            total += in[i + NUM_PARTS * j];
-
-        out[i] = total;
-    }
-
-
-}   // namespace histogram_gmem_atomics
-
-
-template <
-    int         ACTIVE_CHANNELS,
-    int         NUM_BINS,
-    typename    PixelType>
-double run_gmem_atomics(
-    PixelType *d_image,
-    int width,
-    int height,
-    unsigned int *d_hist,
-    bool warmup)
-{
-    enum
-    {
-        NUM_PARTS = 1024
-    };
-
-    cudaDeviceProp props;
-    cudaGetDeviceProperties(&props, 0);
-
-    dim3 block(32, 4);
-    dim3 grid(16, 16);
-    int total_blocks = grid.x * grid.y;
-
-    // allocate partial histogram
-    unsigned int *d_part_hist;
-    cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));
-
-    dim3 block2(128);
-    dim3 grid2((3 * NUM_BINS + block.x - 1) / block.x);
-
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-
-    histogram_gmem_atomics::histogram_gmem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
-        d_image,
-        width,
-        height,
-        d_part_hist);
-
-    histogram_gmem_atomics::histogram_gmem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
-        d_part_hist,
-        total_blocks,
-        d_hist);
-
-    gpu_timer.Stop();
-    float elapsed_millis = gpu_timer.ElapsedMillis();
-
-    cudaFree(d_part_hist);
-
-    return elapsed_millis;
-}
-
diff --git a/external/cub/experimental/histogram/histogram_smem_atomics.h b/external/cub/experimental/histogram/histogram_smem_atomics.h
deleted file mode 100644
index 5703d81133f..00000000000
--- a/external/cub/experimental/histogram/histogram_smem_atomics.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-#include <test/test_util.h>
-
-namespace histogram_smem_atomics
-{
-    // Decode float4 pixel into bins
-    template <int NUM_BINS, int ACTIVE_CHANNELS>
-    __device__ __forceinline__ void DecodePixel(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
-    {
-        float* samples = reinterpret_cast<float*>(&pixel);
-
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-            bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
-    }
-
-    // Decode uchar4 pixel into bins
-    template <int NUM_BINS, int ACTIVE_CHANNELS>
-    __device__ __forceinline__ void DecodePixel(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
-    {
-        unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);
-
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-            bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
-    }
-
-    // Decode uchar1 pixel into bins
-    template <int NUM_BINS, int ACTIVE_CHANNELS>
-    __device__ __forceinline__ void DecodePixel(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
-    {
-        bins[0] = (unsigned int) pixel.x;
-    }
-
-    // First-pass histogram kernel (binning into privatized counters)
-    template <
-        int         NUM_PARTS,
-        int         ACTIVE_CHANNELS,
-        int         NUM_BINS,
-        typename    PixelType>
-    __global__ void histogram_smem_atomics(
-        const PixelType *in,
-        int width,
-        int height,
-        unsigned int *out)
-    {
-        // global position and size
-        int x = blockIdx.x * blockDim.x + threadIdx.x;
-        int y = blockIdx.y * blockDim.y + threadIdx.y;
-        int nx = blockDim.x * gridDim.x;
-        int ny = blockDim.y * gridDim.y;
-
-        // threads in workgroup
-        int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1
-        int nt = blockDim.x * blockDim.y; // total threads in workgroup
-
-        // group index in 0..ngroups-1
-        int g = blockIdx.x + blockIdx.y * gridDim.x;
-
-        // initialize smem
-        __shared__ unsigned int smem[ACTIVE_CHANNELS * NUM_BINS + 3];
-        for (int i = t; i < ACTIVE_CHANNELS * NUM_BINS + 3; i += nt)
-            smem[i] = 0;
-        __syncthreads();
-
-        // process pixels
-        // updates our group's partial histogram in smem
-        for (int col = x; col < width; col += nx)
-        {
-            for (int row = y; row < height; row += ny)
-            {
-                PixelType pixel = in[row * width + col];
-
-                unsigned int bins[ACTIVE_CHANNELS];
-                DecodePixel<NUM_BINS>(pixel, bins);
-
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-                    atomicAdd(&smem[(NUM_BINS * CHANNEL) + bins[CHANNEL] + CHANNEL], 1);
-            }
-        }
-
-        __syncthreads();
-
-        // move to our workgroup's slice of output
-        out += g * NUM_PARTS;
-
-        // store local output to global
-        for (int i = t; i < NUM_BINS; i += nt)
-        {
-            #pragma unroll
-            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-                out[i + NUM_BINS * CHANNEL] = smem[i + NUM_BINS * CHANNEL + CHANNEL];
-        }
-    }
-
-    // Second pass histogram kernel (accumulation)
-    template <
-        int         NUM_PARTS,
-        int         ACTIVE_CHANNELS,
-        int         NUM_BINS>
-    __global__ void histogram_smem_accum(
-        const unsigned int *in,
-        int n,
-        unsigned int *out)
-    {
-        int i = blockIdx.x * blockDim.x + threadIdx.x;
-        if (i > ACTIVE_CHANNELS * NUM_BINS) return; // out of range
-        unsigned int total = 0;
-        for (int j = 0; j < n; j++)
-            total += in[i + NUM_PARTS * j];
-        out[i] = total;
-    }
-
-}   // namespace histogram_smem_atomics
-
-
-template <
-    int         ACTIVE_CHANNELS,
-    int         NUM_BINS,
-    typename    PixelType>
-double run_smem_atomics(
-    PixelType *d_image,
-    int width,
-    int height,
-    unsigned int *d_hist, 
-    bool warmup)
-{
-    enum
-    {
-        NUM_PARTS = 1024
-    };
-
-    cudaDeviceProp props;
-    cudaGetDeviceProperties(&props, 0);
-
-    dim3 block(32, 4);
-    dim3 grid(16, 16);
-    int total_blocks = grid.x * grid.y;
-
-    // allocate partial histogram
-    unsigned int *d_part_hist;
-    cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));
-
-    dim3 block2(128);
-    dim3 grid2((ACTIVE_CHANNELS * NUM_BINS + block.x - 1) / block.x);
-
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-
-    histogram_smem_atomics::histogram_smem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
-        d_image,
-        width,
-        height,
-        d_part_hist);
-
-    histogram_smem_atomics::histogram_smem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
-        d_part_hist,
-        total_blocks,
-        d_hist);
-
-    gpu_timer.Stop();
-    float elapsed_millis = gpu_timer.ElapsedMillis();
-
-    cudaFree(d_part_hist);
-
-    return elapsed_millis;
-}
-
diff --git a/external/cub/experimental/histogram_compare.cu b/external/cub/experimental/histogram_compare.cu
deleted file mode 100644
index 0c72aafa847..00000000000
--- a/external/cub/experimental/histogram_compare.cu
+++ /dev/null
@@ -1,635 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-#include <stdio.h>
-#include <map>
-#include <vector>
-#include <algorithm>
-#include <cstdio>
-#include <fstream>
-
-#include "histogram/histogram_gmem_atomics.h"
-#include "histogram/histogram_smem_atomics.h"
-#include "histogram/histogram_cub.h"
-
-#include <cub/util_allocator.cuh>
-#include <test/test_util.h>
-
-using namespace cub;
-
-//---------------------------------------------------------------------
-// Globals, constants, and type declarations
-//---------------------------------------------------------------------
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-bool                    g_verbose = false;  // Whether to display input/output to console
-bool                    g_report = false;   // Whether to display a full report in CSV format
-CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
-
-struct less_than_value
-{
-    inline bool operator()(
-        const std::pair<std::string, double> &a,
-        const std::pair<std::string, double> &b)
-    {
-        return a.second < b.second;
-    }
-};
-
-
-//---------------------------------------------------------------------
-// Targa (.tga) image file parsing
-//---------------------------------------------------------------------
-
-/**
- * TGA image header info
- */
-struct TgaHeader
-{
-    char idlength;
-    char colormaptype;
-    char datatypecode;
-    short colormaporigin;
-    short colormaplength;
-    char colormapdepth;
-    short x_origin;
-    short y_origin;
-    short width;
-    short height;
-    char bitsperpixel;
-    char imagedescriptor;
-
-    void Parse (FILE *fptr)
-    {
-        idlength = fgetc(fptr);
-        colormaptype = fgetc(fptr);
-        datatypecode = fgetc(fptr);
-        fread(&colormaporigin, 2, 1, fptr);
-        fread(&colormaplength, 2, 1, fptr);
-        colormapdepth = fgetc(fptr);
-        fread(&x_origin, 2, 1, fptr);
-        fread(&y_origin, 2, 1, fptr);
-        fread(&width, 2, 1, fptr);
-        fread(&height, 2, 1, fptr);
-        bitsperpixel = fgetc(fptr);
-        imagedescriptor = fgetc(fptr);
-    }
-
-    void Display (FILE *fptr)
-    {
-        fprintf(fptr, "ID length:           %d\n", idlength);
-        fprintf(fptr, "Color map type:      %d\n", colormaptype);
-        fprintf(fptr, "Image type:          %d\n", datatypecode);
-        fprintf(fptr, "Color map offset:    %d\n", colormaporigin);
-        fprintf(fptr, "Color map length:    %d\n", colormaplength);
-        fprintf(fptr, "Color map depth:     %d\n", colormapdepth);
-        fprintf(fptr, "X origin:            %d\n", x_origin);
-        fprintf(fptr, "Y origin:            %d\n", y_origin);
-        fprintf(fptr, "Width:               %d\n", width);
-        fprintf(fptr, "Height:              %d\n", height);
-        fprintf(fptr, "Bits per pixel:      %d\n", bitsperpixel);
-        fprintf(fptr, "Descriptor:          %d\n", imagedescriptor);
-    }
-};
-
-
-/**
- * Decode image byte data into pixel
- */
-void ParseTgaPixel(uchar4 &pixel, unsigned char *tga_pixel, int bytes)
-{
-    if (bytes == 4)
-    {
-        pixel.x = tga_pixel[2];
-        pixel.y = tga_pixel[1];
-        pixel.z = tga_pixel[0];
-        pixel.w = tga_pixel[3];
-    }
-    else if (bytes == 3)
-    {
-        pixel.x = tga_pixel[2];
-        pixel.y = tga_pixel[1];
-        pixel.z = tga_pixel[0];
-        pixel.w = 0;
-    }
-    else if (bytes == 2)
-    {
-        pixel.x = (tga_pixel[1] & 0x7c) << 1;
-        pixel.y = ((tga_pixel[1] & 0x03) << 6) | ((tga_pixel[0] & 0xe0) >> 2);
-        pixel.z = (tga_pixel[0] & 0x1f) << 3;
-        pixel.w = (tga_pixel[1] & 0x80);
-    }
-}
-
-
-/**
- * Reads a .tga image file
- */
-void ReadTga(uchar4* &pixels, int &width, int &height, const char *filename)
-{
-    // Open the file
-    FILE *fptr;
-    if ((fptr = fopen(filename, "rb")) == NULL)
-    {
-        fprintf(stderr, "File open failed\n");
-        exit(-1);
-    }
-
-    // Parse header
-    TgaHeader header;
-    header.Parse(fptr);
-//    header.Display(stdout);
-    width = header.width;
-    height = header.height;
-
-    // Verify compatibility
-    if (header.datatypecode != 2 && header.datatypecode != 10)
-    {
-        fprintf(stderr, "Can only handle image type 2 and 10\n");
-        exit(-1);
-    }
-    if (header.bitsperpixel != 16 && header.bitsperpixel != 24 && header.bitsperpixel != 32)
-    {
-        fprintf(stderr, "Can only handle pixel depths of 16, 24, and 32\n");
-        exit(-1);
-    }
-    if (header.colormaptype != 0 && header.colormaptype != 1)
-    {
-        fprintf(stderr, "Can only handle color map types of 0 and 1\n");
-        exit(-1);
-    }
-
-    // Skip unnecessary header info
-    int skip_bytes = header.idlength + (header.colormaptype * header.colormaplength);
-    fseek(fptr, skip_bytes, SEEK_CUR);
-
-    // Read the image
-    int pixel_bytes = header.bitsperpixel / 8;
-
-    // Allocate and initialize pixel data
-    size_t image_bytes = width * height * sizeof(uchar4);
-    if ((pixels == NULL) && ((pixels = (uchar4*) malloc(image_bytes)) == NULL))
-    {
-        fprintf(stderr, "malloc of image failed\n");
-        exit(-1);
-    }
-    memset(pixels, 0, image_bytes);
-
-    // Parse pixels
-    unsigned char   tga_pixel[5];
-    int             current_pixel = 0;
-    while (current_pixel < header.width * header.height)
-    {
-        if (header.datatypecode == 2)
-        {
-            // Uncompressed
-            if (fread(tga_pixel, 1, pixel_bytes, fptr) != pixel_bytes)
-            {
-                fprintf(stderr, "Unexpected end of file at pixel %d  (uncompressed)\n", current_pixel);
-                exit(-1);
-            }
-            ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes);
-            current_pixel++;
-        }
-        else if (header.datatypecode == 10)
-        {
-            // Compressed
-            if (fread(tga_pixel, 1, pixel_bytes + 1, fptr) != pixel_bytes + 1)
-            {
-                fprintf(stderr, "Unexpected end of file at pixel %d (compressed)\n", current_pixel);
-                exit(-1);
-            }
-            int run_length = tga_pixel[0] & 0x7f;
-            ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes);
-            current_pixel++;
-
-            if (tga_pixel[0] & 0x80)
-            {
-                // RLE chunk
-                for (int i = 0; i < run_length; i++)
-                {
-                    ParseTgaPixel(pixels[current_pixel], &(tga_pixel[1]), pixel_bytes);
-                    current_pixel++;
-                }
-            }
-            else
-            {
-                // Normal chunk
-                for (int i = 0; i < run_length; i++)
-                {
-                    if (fread(tga_pixel, 1, pixel_bytes, fptr) != pixel_bytes)
-                    {
-                        fprintf(stderr, "Unexpected end of file at pixel %d (normal)\n", current_pixel);
-                        exit(-1);
-                    }
-                    ParseTgaPixel(pixels[current_pixel], tga_pixel, pixel_bytes);
-                    current_pixel++;
-                }
-            }
-        }
-    }
-
-    // Close file
-    fclose(fptr);
-}
-
-
-
-//---------------------------------------------------------------------
-// Random image generation
-//---------------------------------------------------------------------
-
-/**
- * Generate a random image with specified entropy
- */
-void GenerateRandomImage(uchar4* &pixels, int width, int height, int entropy_reduction)
-{
-    int num_pixels = width * height;
-    size_t image_bytes = num_pixels * sizeof(uchar4);
-    if ((pixels == NULL) && ((pixels = (uchar4*) malloc(image_bytes)) == NULL))
-    {
-        fprintf(stderr, "malloc of image failed\n");
-        exit(-1);
-    }
-
-    for (int i = 0; i < num_pixels; ++i)
-    {
-        RandomBits(pixels[i].x, entropy_reduction);
-        RandomBits(pixels[i].y, entropy_reduction);
-        RandomBits(pixels[i].z, entropy_reduction);
-        RandomBits(pixels[i].w, entropy_reduction);
-    }
-}
-
-
-
-//---------------------------------------------------------------------
-// Histogram verification
-//---------------------------------------------------------------------
-
-// Decode float4 pixel into bins
-template <int NUM_BINS, int ACTIVE_CHANNELS>
-void DecodePixelGold(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
-{
-    float* samples = reinterpret_cast<float*>(&pixel);
-
-    for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-        bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
-}
-
-// Decode uchar4 pixel into bins
-template <int NUM_BINS, int ACTIVE_CHANNELS>
-void DecodePixelGold(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
-{
-    unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);
-
-    for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-        bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
-}
-
-// Decode uchar1 pixel into bins
-template <int NUM_BINS, int ACTIVE_CHANNELS>
-void DecodePixelGold(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
-{
-    bins[0] = (unsigned int) pixel.x;
-}
-
-
-// Compute reference histogram.  Specialized for uchar4
-template <
-    int         ACTIVE_CHANNELS,
-    int         NUM_BINS,
-    typename    PixelType>
-void HistogramGold(PixelType *image, int width, int height, unsigned int* hist)
-{
-    memset(hist, 0, ACTIVE_CHANNELS * NUM_BINS * sizeof(unsigned int));
-
-    for (int i = 0; i < width; i++)
-    {
-        for (int j = 0; j < height; j++)
-        {
-            PixelType pixel = image[i + j * width];
-
-            unsigned int bins[ACTIVE_CHANNELS];
-            DecodePixelGold<NUM_BINS>(pixel, bins);
-
-            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-            {
-                hist[(NUM_BINS * CHANNEL) + bins[CHANNEL]]++;
-            }
-        }
-    }
-}
-
-
-//---------------------------------------------------------------------
-// Test execution
-//---------------------------------------------------------------------
-
-/**
- * Run a specific histogram implementation
- */
-template <
-    int         ACTIVE_CHANNELS,
-    int         NUM_BINS,
-    typename    PixelType>
-void RunTest(
-    std::vector<std::pair<std::string, double> >&   timings,
-    PixelType*                                      d_pixels,
-    const int                                       width,
-    const int                                       height,
-    unsigned int *                                  d_hist,
-    unsigned int *                                  h_hist,
-    int                                             timing_iterations,
-    const char *                                    long_name,
-    const char *                                    short_name,
-    double (*f)(PixelType*, int, int, unsigned int*, bool))
-{
-    if (!g_report) printf("%s ", long_name); fflush(stdout);
-
-    // Run single test to verify (and code cache)
-    (*f)(d_pixels, width, height, d_hist, !g_report);
-
-    int compare = CompareDeviceResults(h_hist, d_hist, ACTIVE_CHANNELS * NUM_BINS, true, g_verbose);
-    if (!g_report) printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
-
-    double elapsed_ms = 0;
-    for (int i = 0; i < timing_iterations; i++)
-    {
-        elapsed_ms += (*f)(d_pixels, width, height, d_hist, false);
-    }
-    double avg_us = (elapsed_ms / timing_iterations) * 1000;    // average in us
-    timings.push_back(std::pair<std::string, double>(short_name, avg_us));
-
-    if (!g_report)
-    {
-        printf("Avg time %.3f us (%d iterations)\n", avg_us, timing_iterations); fflush(stdout);
-    }
-    else
-    {
-        printf("%.3f, ", avg_us); fflush(stdout);
-    }
-
-    AssertEquals(0, compare);
-}
-
-
-/**
- * Evaluate corpus of histogram implementations
- */
-template <
-    int         NUM_CHANNELS,
-    int         ACTIVE_CHANNELS,
-    int         NUM_BINS,
-    typename    PixelType>
-void TestMethods(
-    PixelType*  h_pixels,
-    int         height,
-    int         width,
-    int         timing_iterations,
-    double      bandwidth_GBs)
-{
-    // Copy data to gpu
-    PixelType* d_pixels;
-    size_t pixel_bytes = width * height * sizeof(PixelType);
-    CubDebugExit(g_allocator.DeviceAllocate((void**) &d_pixels, pixel_bytes));
-    CubDebugExit(cudaMemcpy(d_pixels, h_pixels, pixel_bytes, cudaMemcpyHostToDevice));
-
-    if (g_report) printf("%.3f, ", double(pixel_bytes) / bandwidth_GBs / 1000);
-
-    // Allocate results arrays on cpu/gpu
-    unsigned int *h_hist;
-    unsigned int *d_hist;
-    size_t histogram_bytes = NUM_BINS * ACTIVE_CHANNELS * sizeof(unsigned int);
-    h_hist = (unsigned int *) malloc(histogram_bytes);
-    g_allocator.DeviceAllocate((void **) &d_hist, histogram_bytes);
-
-    // Compute reference cpu histogram
-    HistogramGold<ACTIVE_CHANNELS, NUM_BINS>(h_pixels, width, height, h_hist);
-
-    // Store timings
-    std::vector<std::pair<std::string, double> > timings;
-
-    // Run experiments
-    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
-        "CUB", "CUB", run_cub_histogram<NUM_CHANNELS, ACTIVE_CHANNELS, NUM_BINS, PixelType>);
-    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
-        "Shared memory atomics", "smem atomics", run_smem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
-    RunTest<ACTIVE_CHANNELS, NUM_BINS>(timings, d_pixels, width, height, d_hist, h_hist, timing_iterations,
-        "Global memory atomics", "gmem atomics", run_gmem_atomics<ACTIVE_CHANNELS, NUM_BINS, PixelType>);
-
-    // Report timings
-    if (!g_report)
-    {
-        std::sort(timings.begin(), timings.end(), less_than_value());
-        printf("Timings (us):\n");
-        for (int i = 0; i < timings.size(); i++)
-        {
-            double bandwidth = height * width * sizeof(PixelType) / timings[i].second / 1000;
-            printf("\t %.3f %s (%.3f GB/s, %.3f%% peak)\n", timings[i].second, timings[i].first.c_str(), bandwidth, bandwidth / bandwidth_GBs * 100);
-        }
-        printf("\n");
-    }
-
-    // Free data
-    CubDebugExit(g_allocator.DeviceFree(d_pixels));
-    CubDebugExit(g_allocator.DeviceFree(d_hist));
-    free(h_hist);
-}
-
-
-/**
- * Test different problem genres
- */
-void TestGenres(
-    uchar4*     uchar4_pixels,
-    int         height,
-    int         width,
-    int         timing_iterations,
-    double      bandwidth_GBs)
-{
-    int num_pixels = width * height;
-
-    {
-        if (!g_report) printf("1 channel uchar1 tests (256-bin):\n\n"); fflush(stdout);
-
-        size_t      image_bytes     = num_pixels * sizeof(uchar1);
-        uchar1*     uchar1_pixels   = (uchar1*) malloc(image_bytes);
-
-        // Convert to 1-channel (averaging first 3 channels)
-        for (int i = 0; i < num_pixels; ++i)
-        {
-            uchar1_pixels[i].x = (unsigned char)
-                (((unsigned int) uchar4_pixels[i].x +
-                  (unsigned int) uchar4_pixels[i].y +
-                  (unsigned int) uchar4_pixels[i].z) / 3);
-        }
-
-        TestMethods<1, 1, 256>(uchar1_pixels, width, height, timing_iterations, bandwidth_GBs);
-        free(uchar1_pixels);
-        if (g_report) printf(", ");
-    }
-
-    {
-        if (!g_report) printf("3/4 channel uchar4 tests (256-bin):\n\n"); fflush(stdout);
-        TestMethods<4, 3, 256>(uchar4_pixels, width, height, timing_iterations, bandwidth_GBs);
-        if (g_report) printf(", ");
-    }
-
-    {
-        if (!g_report) printf("3/4 channel float4 tests (256-bin):\n\n"); fflush(stdout);
-        size_t      image_bytes     = num_pixels * sizeof(float4);
-        float4*     float4_pixels   = (float4*) malloc(image_bytes);
-
-        // Convert to float4 with range [0.0, 1.0)
-        for (int i = 0; i < num_pixels; ++i)
-        {
-            float4_pixels[i].x = float(uchar4_pixels[i].x) / 256;
-            float4_pixels[i].y = float(uchar4_pixels[i].y) / 256;
-            float4_pixels[i].z = float(uchar4_pixels[i].z) / 256;
-            float4_pixels[i].w = float(uchar4_pixels[i].w) / 256;
-        }
-        TestMethods<4, 3, 256>(float4_pixels, width, height, timing_iterations, bandwidth_GBs);
-        free(float4_pixels);
-        if (g_report) printf("\n");
-    }
-}
-
-
-/**
- * Main
- */
-int main(int argc, char **argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf(
-            "%s "
-            "[--device=<device-id>] "
-            "[--v] "
-            "[--i=<timing iterations>] "
-            "\n\t"
-                "--file=<.tga filename> "
-            "\n\t"
-                "--entropy=<-1 (0%), 0 (100%), 1 (81%), 2 (54%), 3 (34%), 4 (20%), ..."
-                "[--height=<default: 1080>] "
-                "[--width=<default: 1920>] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    std::string         filename;
-    int                 timing_iterations   = 100;
-    int                 entropy_reduction   = 0;
-    int                 height              = 1080;
-    int                 width               = 1920;
-
-    g_verbose = args.CheckCmdLineFlag("v");
-    g_report = args.CheckCmdLineFlag("report");
-    args.GetCmdLineArgument("i", timing_iterations);
-    args.GetCmdLineArgument("file", filename);
-    args.GetCmdLineArgument("height", height);
-    args.GetCmdLineArgument("width", width);
-    args.GetCmdLineArgument("entropy", entropy_reduction);
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Get GPU device bandwidth (GB/s)
-    int device_ordinal, bus_width, mem_clock_khz;
-    CubDebugExit(cudaGetDevice(&device_ordinal));
-    CubDebugExit(cudaDeviceGetAttribute(&bus_width, cudaDevAttrGlobalMemoryBusWidth, device_ordinal));
-    CubDebugExit(cudaDeviceGetAttribute(&mem_clock_khz, cudaDevAttrMemoryClockRate, device_ordinal));
-    double bandwidth_GBs = double(bus_width) * mem_clock_khz * 2 / 8 / 1000 / 1000;
-
-    // Run test(s)
-    uchar4* uchar4_pixels = NULL;
-    if (!g_report)
-    {
-        if (!filename.empty())
-        {
-            // Parse targa file
-            ReadTga(uchar4_pixels, width, height, filename.c_str());
-            printf("File %s: width(%d) height(%d)\n\n", filename.c_str(), width, height); fflush(stdout);
-        }
-        else
-        {
-            // Generate image
-            GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
-            printf("Random image: entropy-reduction(%d) width(%d) height(%d)\n\n", entropy_reduction, width, height); fflush(stdout);
-        }
-
-        TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
-    }
-    else
-    {
-        // Run test suite
-        printf("Test, MIN, RLE CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM, , MIN, RLE_CUB, SMEM, GMEM\n");
-
-        // Entropy reduction tests
-        for (entropy_reduction = 0; entropy_reduction < 5; ++entropy_reduction)
-        {
-            printf("entropy reduction %d, ", entropy_reduction);
-            GenerateRandomImage(uchar4_pixels, width, height, entropy_reduction);
-            TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
-        }
-        printf("entropy reduction -1, ");
-        GenerateRandomImage(uchar4_pixels, width, height, -1);
-        TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
-        printf("\n");
-
-        // File image tests
-        std::vector<std::string> file_tests;
-        file_tests.push_back("animals");
-        file_tests.push_back("apples");
-        file_tests.push_back("sunset");
-        file_tests.push_back("cheetah");
-        file_tests.push_back("nature");
-        file_tests.push_back("operahouse");
-        file_tests.push_back("austin");
-        file_tests.push_back("cityscape");
-
-        for (int i = 0; i < file_tests.size(); ++i)
-        {
-            printf("%s, ", file_tests[i].c_str());
-            std::string filename = std::string("histogram/benchmark/") + file_tests[i] + ".tga";
-            ReadTga(uchar4_pixels, width, height, filename.c_str());
-            TestGenres(uchar4_pixels, height, width, timing_iterations, bandwidth_GBs);
-        }
-    }
-
-    free(uchar4_pixels);
-
-    CubDebugExit(cudaDeviceSynchronize());
-    printf("\n\n");
-
-    return 0;
-}
diff --git a/external/cub/experimental/sparse_matrix.h b/external/cub/experimental/sparse_matrix.h
deleted file mode 100644
index 5ac34a1de53..00000000000
--- a/external/cub/experimental/sparse_matrix.h
+++ /dev/null
@@ -1,1244 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Matrix data structures and parsing logic
- ******************************************************************************/
-
-#pragma once
-
-#include <cmath>
-#include <cstring>
-
-#include <iterator>
-#include <string>
-#include <algorithm>
-#include <iostream>
-#include <queue>
-#include <set>
-#include <fstream>
-#include <stdio.h>
-
-#ifdef CUB_MKL
-    #include <numa.h>
-    #include <mkl.h>
-#endif
-
-using namespace std;
-
-/******************************************************************************
- * COO matrix type
- ******************************************************************************/
-
-struct GraphStats
-{
-    int         num_rows;
-    int         num_cols;
-    int         num_nonzeros;
-
-    double      diag_dist_mean;         // mean
-    double      diag_dist_std_dev;      // sample std dev
-    double      pearson_r;    // coefficient of variation
-
-    double      row_length_mean;        // mean
-    double      row_length_std_dev;     // sample std_dev
-    double      row_length_variation;   // coefficient of variation
-    double      row_length_skewness;    // skewness
-
-    void Display(bool show_labels = true)
-    {
-        if (show_labels)
-            printf("\n"
-                "\t num_rows: %d\n"
-                "\t num_cols: %d\n"
-                "\t num_nonzeros: %d\n"
-                "\t diag_dist_mean: %.2f\n"
-                "\t diag_dist_std_dev: %.2f\n"
-                "\t pearson_r: %f\n"
-                "\t row_length_mean: %.5f\n"
-                "\t row_length_std_dev: %.5f\n"
-                "\t row_length_variation: %.5f\n"
-                "\t row_length_skewness: %.5f\n",
-                    num_rows,
-                    num_cols,
-                    num_nonzeros,
-                    diag_dist_mean,
-                    diag_dist_std_dev,
-                    pearson_r,
-                    row_length_mean,
-                    row_length_std_dev,
-                    row_length_variation,
-                    row_length_skewness);
-        else
-            printf(
-                "%d, "
-                "%d, "
-                "%d, "
-                "%.2f, "
-                "%.2f, "
-                "%f, "
-                "%.5f, "
-                "%.5f, "
-                "%.5f, "
-                "%.5f, ",
-                    num_rows,
-                    num_cols,
-                    num_nonzeros,
-                    diag_dist_mean,
-                    diag_dist_std_dev,
-                    pearson_r,
-                    row_length_mean,
-                    row_length_std_dev,
-                    row_length_variation,
-                    row_length_skewness);
-    }
-};
-
-
-
-/******************************************************************************
- * COO matrix type
- ******************************************************************************/
-
-
-/**
- * COO matrix type.  A COO matrix is just a vector of edge tuples.  Tuples are sorted
- * first by row, then by column.
- */
-template<typename ValueT, typename OffsetT>
-struct CooMatrix
-{
-    //---------------------------------------------------------------------
-    // Type definitions and constants
-    //---------------------------------------------------------------------
-
-    // COO edge tuple
-    struct CooTuple
-    {
-        OffsetT            row;
-        OffsetT            col;
-        ValueT             val;
-
-        CooTuple() {}
-        CooTuple(OffsetT row, OffsetT col) : row(row), col(col) {}
-        CooTuple(OffsetT row, OffsetT col, ValueT val) : row(row), col(col), val(val) {}
-
-        /**
-         * Comparator for sorting COO sparse format num_nonzeros
-         */
-        bool operator<(const CooTuple &other) const
-        {
-            if ((row < other.row) || ((row == other.row) && (col < other.col)))
-            {
-                return true;
-            }
-
-            return false;
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Data members
-    //---------------------------------------------------------------------
-
-    // Fields
-    int                 num_rows;
-    int                 num_cols;
-    int                 num_nonzeros;
-    CooTuple*           coo_tuples;
-
-    //---------------------------------------------------------------------
-    // Methods
-    //---------------------------------------------------------------------
-
-    // Constructor
-    CooMatrix() : num_rows(0), num_cols(0), num_nonzeros(0), coo_tuples(NULL) {}
-
-
-    /**
-     * Clear
-     */
-    void Clear()
-    {
-        if (coo_tuples) delete[] coo_tuples;
-        coo_tuples = NULL;
-    }
-
-
-    // Destructor
-    ~CooMatrix()
-    {
-        Clear();
-    }
-
-
-    // Display matrix to stdout
-    void Display()
-    {
-        cout << "COO Matrix (" << num_rows << " rows, " << num_cols << " columns, " << num_nonzeros << " non-zeros):\n";
-        cout << "Ordinal, Row, Column, Value\n";
-        for (int i = 0; i < num_nonzeros; i++)
-        {
-            cout << '\t' << i << ',' << coo_tuples[i].row << ',' << coo_tuples[i].col << ',' << coo_tuples[i].val << "\n";
-        }
-    }
-
-
-    /**
-     * Builds a symmetric COO sparse from an asymmetric CSR matrix.
-     */
-    template <typename CsrMatrixT>
-    void InitCsrSymmetric(CsrMatrixT &csr_matrix)
-    {
-        if (coo_tuples)
-        {
-            fprintf(stderr, "Matrix already constructed\n");
-            exit(1);
-        }
-
-        num_rows        = csr_matrix.num_cols;
-        num_cols        = csr_matrix.num_rows;
-        num_nonzeros    = csr_matrix.num_nonzeros * 2;
-        coo_tuples      = new CooTuple[num_nonzeros];
-
-        for (OffsetT row = 0; row < csr_matrix.num_rows; ++row)
-        {
-            for (OffsetT nonzero = csr_matrix.row_offsets[row]; nonzero < csr_matrix.row_offsets[row + 1]; ++nonzero)
-            {
-                coo_tuples[nonzero].row = row;
-                coo_tuples[nonzero].col = csr_matrix.column_indices[nonzero];
-                coo_tuples[nonzero].val = csr_matrix.values[nonzero];
-
-                coo_tuples[csr_matrix.num_nonzeros + nonzero].row = coo_tuples[nonzero].col;
-                coo_tuples[csr_matrix.num_nonzeros + nonzero].col = coo_tuples[nonzero].row;
-                coo_tuples[csr_matrix.num_nonzeros + nonzero].val = csr_matrix.values[nonzero];
-
-            }
-        }
-
-        // Sort by rows, then columns
-        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
-    }
-
-    /**
-     * Builds a COO sparse from a relabeled CSR matrix.
-     */
-    template <typename CsrMatrixT>
-    void InitCsrRelabel(CsrMatrixT &csr_matrix, OffsetT* relabel_indices)
-    {
-        if (coo_tuples)
-        {
-            fprintf(stderr, "Matrix already constructed\n");
-            exit(1);
-        }
-
-        num_rows        = csr_matrix.num_rows;
-        num_cols        = csr_matrix.num_cols;
-        num_nonzeros    = csr_matrix.num_nonzeros;
-        coo_tuples      = new CooTuple[num_nonzeros];
-
-        for (OffsetT row = 0; row < num_rows; ++row)
-        {
-            for (OffsetT nonzero = csr_matrix.row_offsets[row]; nonzero < csr_matrix.row_offsets[row + 1]; ++nonzero)
-            {
-                coo_tuples[nonzero].row = relabel_indices[row];
-                coo_tuples[nonzero].col = relabel_indices[csr_matrix.column_indices[nonzero]];
-                coo_tuples[nonzero].val = csr_matrix.values[nonzero];
-            }
-        }
-
-        // Sort by rows, then columns
-        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
-    }
-
-
-
-    /**
-     * Builds a METIS COO sparse from the given file.
-     */
-    void InitMetis(const string &metis_filename)
-    {
-        if (coo_tuples)
-        {
-            fprintf(stderr, "Matrix already constructed\n");
-            exit(1);
-        }
-
-        // TODO
-    }
-
-
-    /**
-     * Builds a MARKET COO sparse from the given file.
-     */
-    void InitMarket(
-        const string&   market_filename,
-        ValueT          default_value       = 1.0,
-        bool            verbose             = false)
-    {
-        if (verbose) {
-            printf("Reading... "); fflush(stdout);
-        }
-
-        if (coo_tuples)
-        {
-            fprintf(stderr, "Matrix already constructed\n");
-            exit(1);
-        }
-
-        std::ifstream ifs;
-        ifs.open(market_filename.c_str(), std::ifstream::in);
-        if (!ifs.good())
-        {
-            fprintf(stderr, "Error opening file\n");
-            exit(1);
-        }
-
-        bool    array = false;
-        bool    symmetric = false;
-        bool    skew = false;
-        int     current_edge = -1;
-        char    line[1024];
-
-        if (verbose) {
-            printf("Parsing... "); fflush(stdout);
-        }
-
-        while (true)
-        {
-            ifs.getline(line, 1024);
-            if (!ifs.good())
-            {
-                // Done
-                break;
-            }
-
-            if (line[0] == '%')
-            {
-                // Comment
-                if (line[1] == '%')
-                {
-                    // Banner
-                    symmetric   = (strstr(line, "symmetric") != NULL);
-                    skew        = (strstr(line, "skew") != NULL);
-                    array       = (strstr(line, "array") != NULL);
-
-                    if (verbose) {
-                        printf("(symmetric: %d, skew: %d, array: %d) ", symmetric, skew, array); fflush(stdout);
-                    }
-                }
-            }
-            else if (current_edge == -1)
-            {
-                // Problem description
-                int nparsed = sscanf(line, "%d %d %d", &num_rows, &num_cols, &num_nonzeros);
-                if ((!array) && (nparsed == 3))
-                {
-                    if (symmetric)
-                        num_nonzeros *= 2;
-
-                    // Allocate coo matrix
-                    coo_tuples = new CooTuple[num_nonzeros];
-                    current_edge = 0;
-
-                }
-                else if (array && (nparsed == 2))
-                {
-                    // Allocate coo matrix
-                    num_nonzeros = num_rows * num_cols;
-                    coo_tuples = new CooTuple[num_nonzeros];
-                    current_edge = 0;
-                }
-                else
-                {
-                    fprintf(stderr, "Error parsing MARKET matrix: invalid problem description: %s\n", line);
-                    exit(1);
-                }
-
-            }
-            else
-            {
-                // Edge
-                if (current_edge >= num_nonzeros)
-                {
-                    fprintf(stderr, "Error parsing MARKET matrix: encountered more than %d num_nonzeros\n", num_nonzeros);
-                    exit(1);
-                }
-
-                int row, col;
-                double val;
-
-                if (array)
-                {
-                    if (sscanf(line, "%lf", &val) != 1)
-                    {
-                        fprintf(stderr, "Error parsing MARKET matrix: badly formed current_edge: '%s' at edge %d\n", line, current_edge);
-                        exit(1);
-                    }
-                    col = (current_edge / num_rows);
-                    row = (current_edge - (num_rows * col));
-
-                    coo_tuples[current_edge] = CooTuple(row, col, val);    // Convert indices to zero-based
-                }
-                else
-                {
-                    // Parse nonzero (note: using strtol and strtod is 2x faster than sscanf or istream parsing)
-                    char *l = line;
-                    char *t = NULL;
-
-                    // parse row
-                    row = strtol(l, &t, 0);
-                    if (t == l)
-                    {
-                        fprintf(stderr, "Error parsing MARKET matrix: badly formed row at edge %d\n", current_edge);
-                        exit(1);
-                    }
-                    l = t;
-
-                    // parse col
-                    col = strtol(l, &t, 0);
-                    if (t == l)
-                    {
-                        fprintf(stderr, "Error parsing MARKET matrix: badly formed col at edge %d\n", current_edge);
-                        exit(1);
-                    }
-                    l = t;
-
-                    // parse val
-                    val = strtod(l, &t);
-                    if (t == l)
-                    {
-                        val = default_value;
-                    }
-/*
-                    int nparsed = sscanf(line, "%d %d %lf", &row, &col, &val);
-                    if (nparsed == 2)
-                    {
-                        // No value specified
-                        val = default_value;
-                        
-                    }
-                    else if (nparsed != 3)
-                    {
-                        fprintf(stderr, "Error parsing MARKET matrix 1: badly formed current_edge: %d parsed at edge %d\n", nparsed, current_edge);
-                        exit(1);
-                    }
-*/
-
-                    coo_tuples[current_edge] = CooTuple(row - 1, col - 1, val);    // Convert indices to zero-based
-
-                }
-
-                current_edge++;
-
-                if (symmetric && (row != col))
-                {
-                    coo_tuples[current_edge].row = coo_tuples[current_edge - 1].col;
-                    coo_tuples[current_edge].col = coo_tuples[current_edge - 1].row;
-                    coo_tuples[current_edge].val = coo_tuples[current_edge - 1].val * (skew ? -1 : 1);
-                    current_edge++;
-                }
-            }
-        }
-
-        // Adjust nonzero count (nonzeros along the diagonal aren't reversed)
-        num_nonzeros = current_edge;
-
-        if (verbose) {
-            printf("done. Ordering..."); fflush(stdout);
-        }
-
-        // Sort by rows, then columns
-        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
-
-        if (verbose) {
-            printf("done. "); fflush(stdout);
-        }
-
-        ifs.close();
-    }
-
-
-    /**
-     * Builds a dense matrix
-     */
-    int InitDense(
-        OffsetT     num_rows,
-        OffsetT     num_cols,
-        ValueT      default_value   = 1.0,
-        bool        verbose         = false)
-    {
-        if (coo_tuples)
-        {
-            fprintf(stderr, "Matrix already constructed\n");
-            exit(1);
-        }
-
-        this->num_rows  = num_rows;
-        this->num_cols  = num_cols;
-
-        num_nonzeros    = num_rows * num_cols;
-        coo_tuples      = new CooTuple[num_nonzeros];
-
-        for (OffsetT row = 0; row < num_rows; ++row)
-        {
-            for (OffsetT col = 0; col < num_cols; ++col)
-            {
-                coo_tuples[(row * num_cols) + col] = CooTuple(row, col, default_value);
-            }
-        }
-
-        // Sort by rows, then columns
-        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
-
-        return 0;
-    }
-
-    /**
-     * Builds a wheel COO sparse matrix having spokes spokes.
-     */
-    int InitWheel(
-        OffsetT     spokes,
-        ValueT      default_value   = 1.0,
-        bool        verbose         = false)
-    {
-        if (coo_tuples)
-        {
-            fprintf(stderr, "Matrix already constructed\n");
-            exit(1);
-        }
-
-        num_rows        = spokes + 1;
-        num_cols        = num_rows;
-        num_nonzeros    = spokes * 2;
-        coo_tuples      = new CooTuple[num_nonzeros];
-
-        // Add spoke num_nonzeros
-        int current_edge = 0;
-        for (OffsetT i = 0; i < spokes; i++)
-        {
-            coo_tuples[current_edge] = CooTuple(0, i + 1, default_value);
-            current_edge++;
-        }
-
-        // Add rim
-        for (OffsetT i = 0; i < spokes; i++)
-        {
-            OffsetT dest = (i + 1) % spokes;
-            coo_tuples[current_edge] = CooTuple(i + 1, dest + 1, default_value);
-            current_edge++;
-        }
-
-        // Sort by rows, then columns
-        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
-
-        return 0;
-    }
-
-
-    /**
-     * Builds a square 2D grid CSR matrix.  Interior num_vertices have degree 5 when including
-     * a self-loop.
-     *
-     * Returns 0 on success, 1 on failure.
-     */
-    int InitGrid2d(OffsetT width, bool self_loop, ValueT default_value = 1.0)
-    {
-        if (coo_tuples)
-        {
-            fprintf(stderr, "Matrix already constructed\n");
-            exit(1);
-        }
-
-        int     interior_nodes  = (width - 2) * (width - 2);
-        int     edge_nodes      = (width - 2) * 4;
-        int     corner_nodes    = 4;
-        num_rows                       = width * width;
-        num_cols                       = num_rows;
-        num_nonzeros                   = (interior_nodes * 4) + (edge_nodes * 3) + (corner_nodes * 2);
-
-        if (self_loop)
-            num_nonzeros += num_rows;
-
-        coo_tuples          = new CooTuple[num_nonzeros];
-        int current_edge    = 0;
-
-        for (OffsetT j = 0; j < width; j++)
-        {
-            for (OffsetT k = 0; k < width; k++)
-            {
-                OffsetT me = (j * width) + k;
-
-                // West
-                OffsetT neighbor = (j * width) + (k - 1);
-                if (k - 1 >= 0) {
-                    coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                    current_edge++;
-                }
-
-                // East
-                neighbor = (j * width) + (k + 1);
-                if (k + 1 < width) {
-                    coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                    current_edge++;
-                }
-
-                // North
-                neighbor = ((j - 1) * width) + k;
-                if (j - 1 >= 0) {
-                    coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                    current_edge++;
-                }
-
-                // South
-                neighbor = ((j + 1) * width) + k;
-                if (j + 1 < width) {
-                    coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                    current_edge++;
-                }
-
-                if (self_loop)
-                {
-                    neighbor = me;
-                    coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                    current_edge++;
-                }
-            }
-        }
-
-        // Sort by rows, then columns, update dims
-        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
-
-        return 0;
-    }
-
-
-    /**
-     * Builds a square 3D grid COO sparse matrix.  Interior num_vertices have degree 7 when including
-     * a self-loop.  Values are unintialized, coo_tuples are sorted.
-     */
-    int InitGrid3d(OffsetT width, bool self_loop, ValueT default_value = 1.0)
-    {
-        if (coo_tuples)
-        {
-            fprintf(stderr, "Matrix already constructed\n");
-            return -1;
-        }
-
-        OffsetT interior_nodes  = (width - 2) * (width - 2) * (width - 2);
-        OffsetT face_nodes      = (width - 2) * (width - 2) * 6;
-        OffsetT edge_nodes      = (width - 2) * 12;
-        OffsetT corner_nodes    = 8;
-        num_cols                       = width * width * width;
-        num_rows                       = num_cols;
-        num_nonzeros                     = (interior_nodes * 6) + (face_nodes * 5) + (edge_nodes * 4) + (corner_nodes * 3);
-
-        if (self_loop)
-            num_nonzeros += num_rows;
-
-        coo_tuples          = new CooTuple[num_nonzeros];
-        int current_edge    = 0;
-
-        for (OffsetT i = 0; i < width; i++)
-        {
-            for (OffsetT j = 0; j < width; j++)
-            {
-                for (OffsetT k = 0; k < width; k++)
-                {
-
-                    OffsetT me = (i * width * width) + (j * width) + k;
-
-                    // Up
-                    OffsetT neighbor = (i * width * width) + (j * width) + (k - 1);
-                    if (k - 1 >= 0) {
-                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                        current_edge++;
-                    }
-
-                    // Down
-                    neighbor = (i * width * width) + (j * width) + (k + 1);
-                    if (k + 1 < width) {
-                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                        current_edge++;
-                    }
-
-                    // West
-                    neighbor = (i * width * width) + ((j - 1) * width) + k;
-                    if (j - 1 >= 0) {
-                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                        current_edge++;
-                    }
-
-                    // East
-                    neighbor = (i * width * width) + ((j + 1) * width) + k;
-                    if (j + 1 < width) {
-                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                        current_edge++;
-                    }
-
-                    // North
-                    neighbor = ((i - 1) * width * width) + (j * width) + k;
-                    if (i - 1 >= 0) {
-                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                        current_edge++;
-                    }
-
-                    // South
-                    neighbor = ((i + 1) * width * width) + (j * width) + k;
-                    if (i + 1 < width) {
-                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                        current_edge++;
-                    }
-
-                    if (self_loop)
-                    {
-                        neighbor = me;
-                        coo_tuples[current_edge] = CooTuple(me, neighbor, default_value);
-                        current_edge++;
-                    }
-                }
-            }
-        }
-
-        // Sort by rows, then columns, update dims
-        std::stable_sort(coo_tuples, coo_tuples + num_nonzeros);
-
-        return 0;
-    }
-};
-
-
-
-/******************************************************************************
- * COO matrix type
- ******************************************************************************/
-
-
-/**
- * CSR sparse format matrix
- */
-template<
-    typename ValueT,
-    typename OffsetT>
-struct CsrMatrix
-{
-    int         num_rows;
-    int         num_cols;
-    int         num_nonzeros;
-    OffsetT*    row_offsets;
-    OffsetT*    column_indices;
-    ValueT*     values;
-    bool        numa_malloc;
-
-    /**
-     * Constructor
-     */
-    CsrMatrix() : num_rows(0), num_cols(0), num_nonzeros(0), row_offsets(NULL), column_indices(NULL), values(NULL) 
-    {
-#ifdef CUB_MKL
-        numa_malloc = ((numa_available() >= 0) && (numa_num_task_nodes() > 1));
-#else
-        numa_malloc = false;
-#endif
-    }
-
-
-    /**
-     * Clear
-     */
-    void Clear()
-    {
-#ifdef CUB_MKL
-        if (numa_malloc) 
-        {
-            numa_free(row_offsets, sizeof(OffsetT) * (num_rows + 1));
-            numa_free(values, sizeof(ValueT) * num_nonzeros);
-            numa_free(column_indices, sizeof(OffsetT) * num_nonzeros);
-        }
-        else
-        {
-            if (row_offsets)    mkl_free(row_offsets);
-            if (column_indices) mkl_free(column_indices);
-            if (values)         mkl_free(values);
-        }
-
-#else
-        if (row_offsets)    delete[] row_offsets;
-        if (column_indices) delete[] column_indices;
-        if (values)         delete[] values;
-#endif
-
-        row_offsets = NULL;
-        column_indices = NULL;
-        values = NULL;
-    }
-
-    /**
-     * Destructor
-     */
-    ~CsrMatrix()
-    {
-        Clear();
-    }
-
-    GraphStats Stats()
-    {
-        GraphStats stats;
-        stats.num_rows = num_rows;
-        stats.num_cols = num_cols;
-        stats.num_nonzeros = num_nonzeros;
-
-        //
-        // Compute diag-distance statistics
-        //
-
-        OffsetT samples     = 0;
-        double  mean        = 0.0;
-        double  ss_tot      = 0.0;
-
-        for (OffsetT row = 0; row < num_rows; ++row)
-        {
-            OffsetT nz_idx_start    = row_offsets[row];
-            OffsetT nz_idx_end      = row_offsets[row + 1];
-
-            for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx)
-            {
-                OffsetT col             = column_indices[nz_idx];
-                double x                = (col > row) ? col - row : row - col;
-
-                samples++;
-                double delta            = x - mean;
-                mean                    = mean + (delta / samples);
-                ss_tot                  += delta * (x - mean);
-            }
-        }
-        stats.diag_dist_mean            = mean;
-        double variance                 = ss_tot / samples;
-        stats.diag_dist_std_dev         = sqrt(variance);
-
-
-        //
-        // Compute deming statistics
-        //
-
-        samples         = 0;
-        double mean_x   = 0.0;
-        double mean_y   = 0.0;
-        double ss_x     = 0.0;
-        double ss_y     = 0.0;
-
-        for (OffsetT row = 0; row < num_rows; ++row)
-        {
-            OffsetT nz_idx_start    = row_offsets[row];
-            OffsetT nz_idx_end      = row_offsets[row + 1];
-
-            for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx)
-            {
-                OffsetT col             = column_indices[nz_idx];
-
-                samples++;
-                double x                = col;
-                double y                = row;
-                double delta;
-
-                delta                   = x - mean_x;
-                mean_x                  = mean_x + (delta / samples);
-                ss_x                    += delta * (x - mean_x);
-
-                delta                   = y - mean_y;
-                mean_y                  = mean_y + (delta / samples);
-                ss_y                    += delta * (y - mean_y);
-            }
-        }
-
-        samples         = 0;
-        double s_xy     = 0.0;
-        double s_xxy    = 0.0;
-        double s_xyy    = 0.0;
-        for (OffsetT row = 0; row < num_rows; ++row)
-        {
-            OffsetT nz_idx_start    = row_offsets[row];
-            OffsetT nz_idx_end      = row_offsets[row + 1];
-
-            for (int nz_idx = nz_idx_start; nz_idx < nz_idx_end; ++nz_idx)
-            {
-                OffsetT col             = column_indices[nz_idx];
-
-                samples++;
-                double x                = col;
-                double y                = row;
-
-                double xy =             (x - mean_x) * (y - mean_y);
-                double xxy =            (x - mean_x) * (x - mean_x) * (y - mean_y);
-                double xyy =            (x - mean_x) * (y - mean_y) * (y - mean_y);
-                double delta;
-
-                delta                   = xy - s_xy;
-                s_xy                    = s_xy + (delta / samples);
-
-                delta                   = xxy - s_xxy;
-                s_xxy                   = s_xxy + (delta / samples);
-
-                delta                   = xyy - s_xyy;
-                s_xyy                   = s_xyy + (delta / samples);
-            }
-        }
-
-        double s_xx     = ss_x / num_nonzeros;
-        double s_yy     = ss_y / num_nonzeros;
-
-        double deming_slope = (s_yy - s_xx + sqrt(((s_yy - s_xx) * (s_yy - s_xx)) + (4 * s_xy * s_xy))) / (2 * s_xy);
-
-        stats.pearson_r = (num_nonzeros * s_xy) / (sqrt(ss_x) * sqrt(ss_y));
-
-
-        //
-        // Compute row-length statistics
-        //
-
-        // Sample mean
-        stats.row_length_mean       = double(num_nonzeros) / num_rows;
-        variance                    = 0.0;
-        stats.row_length_skewness   = 0.0;
-        for (OffsetT row = 0; row < num_rows; ++row)
-        {
-            OffsetT length              = row_offsets[row + 1] - row_offsets[row];
-            double delta                = double(length) - stats.row_length_mean;
-            variance   += (delta * delta);
-            stats.row_length_skewness   += (delta * delta * delta);
-        }
-        variance                    /= num_rows;
-        stats.row_length_std_dev    = sqrt(variance);
-        stats.row_length_skewness   = (stats.row_length_skewness / num_rows) / pow(stats.row_length_std_dev, 3.0);
-        stats.row_length_variation  = stats.row_length_std_dev / stats.row_length_mean;
-
-        return stats;
-    }
-
-    /**
-     * Build CSR matrix from sorted COO matrix
-     */
-    void FromCoo(const CooMatrix<ValueT, OffsetT> &coo_matrix)
-    {
-        num_rows        = coo_matrix.num_rows;
-        num_cols        = coo_matrix.num_cols;
-        num_nonzeros    = coo_matrix.num_nonzeros;
-
-#ifdef CUB_MKL
-
-        if (numa_malloc)
-        {
-            numa_set_strict(1);
-//            numa_set_bind_policy(1);
-
-//        values          = (ValueT*) numa_alloc_interleaved(sizeof(ValueT) * num_nonzeros);
-//        row_offsets     = (OffsetT*) numa_alloc_interleaved(sizeof(OffsetT) * (num_rows + 1));
-//        column_indices  = (OffsetT*) numa_alloc_interleaved(sizeof(OffsetT) * num_nonzeros);
-
-            row_offsets     = (OffsetT*) numa_alloc_onnode(sizeof(OffsetT) * (num_rows + 1), 0);
-            column_indices  = (OffsetT*) numa_alloc_onnode(sizeof(OffsetT) * num_nonzeros, 0);
-            values          = (ValueT*) numa_alloc_onnode(sizeof(ValueT) * num_nonzeros, 1);
-        }
-        else
-        {
-            values          = (ValueT*) mkl_malloc(sizeof(ValueT) * num_nonzeros, 4096);
-            row_offsets     = (OffsetT*) mkl_malloc(sizeof(OffsetT) * (num_rows + 1), 4096);
-            column_indices  = (OffsetT*) mkl_malloc(sizeof(OffsetT) * num_nonzeros, 4096);
-
-        }
-
-#else
-        row_offsets     = new OffsetT[num_rows + 1];
-        column_indices  = new OffsetT[num_nonzeros];
-        values          = new ValueT[num_nonzeros];
-#endif
-
-        OffsetT prev_row = -1;
-        for (OffsetT current_edge = 0; current_edge < num_nonzeros; current_edge++)
-        {
-            OffsetT current_row = coo_matrix.coo_tuples[current_edge].row;
-
-            // Fill in rows up to and including the current row
-            for (OffsetT row = prev_row + 1; row <= current_row; row++)
-            {
-                row_offsets[row] = current_edge;
-            }
-            prev_row = current_row;
-
-            column_indices[current_edge]    = coo_matrix.coo_tuples[current_edge].col;
-            values[current_edge]            = coo_matrix.coo_tuples[current_edge].val;
-        }
-
-        // Fill out any trailing edgeless vertices (and the end-of-list element)
-        for (OffsetT row = prev_row + 1; row <= num_rows; row++)
-        {
-            row_offsets[row] = num_nonzeros;
-        }
-    }
-
-
-    /**
-     * Display log-histogram to stdout
-     */
-    void DisplayHistogram()
-    {
-        // Initialize
-        int log_counts[9];
-        for (int i = 0; i < 9; i++)
-        {
-            log_counts[i] = 0;
-        }
-
-        // Scan
-        int max_log_length = -1;
-        for (OffsetT row = 0; row < num_rows; row++)
-        {
-            OffsetT length = row_offsets[row + 1] - row_offsets[row];
-
-            int log_length = -1;
-            while (length > 0)
-            {
-                length /= 10;
-                log_length++;
-            }
-            if (log_length > max_log_length)
-            {
-                max_log_length = log_length;
-            }
-
-            log_counts[log_length + 1]++;
-        }
-        printf("CSR matrix (%d rows, %d columns, %d non-zeros):\n", (int) num_rows, (int) num_cols, (int) num_nonzeros);
-        for (int i = -1; i < max_log_length + 1; i++)
-        {
-            printf("\tDegree 1e%d: \t%d (%.2f%%)\n", i, log_counts[i + 1], (float) log_counts[i + 1] * 100.0 / num_cols);
-        }
-        fflush(stdout);
-    }
-
-
-    /**
-     * Display matrix to stdout
-     */
-    void Display()
-    {
-        printf("Input Matrix:\n");
-        for (OffsetT row = 0; row < num_rows; row++)
-        {
-            printf("%d [@%d, #%d]: ", row, row_offsets[row], row_offsets[row + 1] - row_offsets[row]);
-            for (OffsetT current_edge = row_offsets[row]; current_edge < row_offsets[row + 1]; current_edge++)
-            {
-                printf("%d (%f), ", column_indices[current_edge], values[current_edge]);
-            }
-            printf("\n");
-        }
-        fflush(stdout);
-    }
-
-
-};
-
-
-
-/******************************************************************************
- * Matrix transformations
- ******************************************************************************/
-
-// Comparator for ordering rows by degree (lowest first), then by row-id (lowest first)
-template <typename OffsetT>
-struct OrderByLow
-{
-    OffsetT* row_degrees;
-    OrderByLow(OffsetT* row_degrees) : row_degrees(row_degrees) {}
-
-    bool operator()(const OffsetT &a, const OffsetT &b)
-    {
-        if (row_degrees[a] < row_degrees[b])
-            return true;
-        else if (row_degrees[a] > row_degrees[b])
-            return false;
-        else
-            return (a < b);
-    }
-};
-
-// Comparator for ordering rows by degree (highest first), then by row-id (lowest first)
-template <typename OffsetT>
-struct OrderByHigh
-{
-    OffsetT* row_degrees;
-    OrderByHigh(OffsetT* row_degrees) : row_degrees(row_degrees) {}
-
-    bool operator()(const OffsetT &a, const OffsetT &b)
-    {
-        if (row_degrees[a] > row_degrees[b])
-            return true;
-        else if (row_degrees[a] < row_degrees[b])
-            return false;
-        else
-            return (a < b);
-    }
-};
-
-
-
-/**
- * Reverse Cuthill-McKee
- */
-template <typename ValueT, typename OffsetT>
-void RcmRelabel(
-    CsrMatrix<ValueT, OffsetT>&     matrix,
-    OffsetT*                        relabel_indices)
-{
-    // Initialize row degrees
-    OffsetT* row_degrees_in     = new OffsetT[matrix.num_rows];
-    OffsetT* row_degrees_out    = new OffsetT[matrix.num_rows];
-    for (OffsetT row = 0; row < matrix.num_rows; ++row)
-    {
-        row_degrees_in[row]         = 0;
-        row_degrees_out[row]        = matrix.row_offsets[row + 1] - matrix.row_offsets[row];
-    }
-    for (OffsetT nonzero = 0; nonzero < matrix.num_nonzeros; ++nonzero)
-    {
-        row_degrees_in[matrix.column_indices[nonzero]]++;
-    }
-
-    // Initialize unlabeled set 
-    typedef std::set<OffsetT, OrderByLow<OffsetT> > UnlabeledSet;
-    typename UnlabeledSet::key_compare  unlabeled_comp(row_degrees_in);
-    UnlabeledSet                        unlabeled(unlabeled_comp);
-    for (OffsetT row = 0; row < matrix.num_rows; ++row)
-    {
-        relabel_indices[row]    = -1;
-        unlabeled.insert(row);
-    }
-
-    // Initialize queue set
-    std::deque<OffsetT> q;
-
-    // Process unlabeled vertices (traverse connected components)
-    OffsetT relabel_idx = 0;
-    while (!unlabeled.empty())
-    {
-        // Seed the unvisited frontier queue with the unlabeled vertex of lowest-degree
-        OffsetT vertex = *unlabeled.begin();
-        q.push_back(vertex);
-
-        while (!q.empty())
-        {
-            vertex = q.front();
-            q.pop_front();
-
-            if (relabel_indices[vertex] == -1)
-            {
-                // Update this vertex
-                unlabeled.erase(vertex);
-                relabel_indices[vertex] = relabel_idx;
-                relabel_idx++;
-
-                // Sort neighbors by degree
-                OrderByLow<OffsetT> neighbor_comp(row_degrees_in);
-                std::sort(
-                    matrix.column_indices + matrix.row_offsets[vertex],
-                    matrix.column_indices + matrix.row_offsets[vertex + 1],
-                    neighbor_comp);
-
-                // Inspect neighbors, adding to the out frontier if unlabeled
-                for (OffsetT neighbor_idx = matrix.row_offsets[vertex];
-                    neighbor_idx < matrix.row_offsets[vertex + 1];
-                    ++neighbor_idx)
-                {
-                    OffsetT neighbor = matrix.column_indices[neighbor_idx];
-                    q.push_back(neighbor);
-                }
-            }
-        }
-    }
-
-/*
-    // Reverse labels
-    for (int row = 0; row < matrix.num_rows; ++row)
-    {
-        relabel_indices[row] = matrix.num_rows - relabel_indices[row] - 1;
-    }
-*/
-
-    // Cleanup
-    if (row_degrees_in) delete[] row_degrees_in;
-    if (row_degrees_out) delete[] row_degrees_out;
-}
-
-
-/**
- * Reverse Cuthill-McKee
- */
-template <typename ValueT, typename OffsetT>
-void RcmRelabel(
-    CsrMatrix<ValueT, OffsetT>&     matrix,
-    bool                            verbose = false)
-{
-    // Do not process if not square
-    if (matrix.num_cols != matrix.num_rows)
-    {
-        if (verbose) {
-            printf("RCM transformation ignored (not square)\n"); fflush(stdout);
-        }
-        return;
-    }
-
-    // Initialize relabel indices
-    OffsetT* relabel_indices = new OffsetT[matrix.num_rows];
-
-    if (verbose) {
-        printf("RCM relabeling... "); fflush(stdout);
-    }
-
-    RcmRelabel(matrix, relabel_indices);
-
-    if (verbose) {
-        printf("done. Reconstituting... "); fflush(stdout);
-    }
-
-    // Create a COO matrix from the relabel indices
-    CooMatrix<ValueT, OffsetT> coo_matrix;
-    coo_matrix.InitCsrRelabel(matrix, relabel_indices);
-
-    // Reconstitute the CSR matrix from the sorted COO tuples
-    if (relabel_indices) delete[] relabel_indices;
-    matrix.Clear();
-    matrix.FromCoo(coo_matrix);
-
-    if (verbose) {
-        printf("done. "); fflush(stdout);
-    }
-}
-
-
-
-
diff --git a/external/cub/experimental/spmv_compare.cu b/external/cub/experimental/spmv_compare.cu
deleted file mode 100644
index 59e07503b0e..00000000000
--- a/external/cub/experimental/spmv_compare.cu
+++ /dev/null
@@ -1,917 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIAeBILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-//---------------------------------------------------------------------
-// SpMV comparison tool
-//---------------------------------------------------------------------
-
-#include <stdio.h>
-#include <map>
-#include <vector>
-#include <algorithm>
-#include <cstdio>
-#include <fstream>
-
-#include <cusparse.h>
-
-#include "sparse_matrix.h"
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <cub/device/device_spmv.cuh>
-#include <cub/util_allocator.cuh>
-#include <cub/iterator/tex_ref_input_iterator.cuh>
-#include <test/test_util.h>
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants, and type declarations
-//---------------------------------------------------------------------
-
-bool                    g_quiet     = false;        // Whether to display stats in CSV format
-bool                    g_verbose   = false;        // Whether to display output to console
-bool                    g_verbose2  = false;        // Whether to display input to console
-CachingDeviceAllocator  g_allocator(true);          // Caching allocator for device memory
-
-
-//---------------------------------------------------------------------
-// SpMV verification
-//---------------------------------------------------------------------
-
-// Compute reference SpMV y = Ax
-template <
-    typename ValueT,
-    typename OffsetT>
-void SpmvGold(
-    CsrMatrix<ValueT, OffsetT>&     a,
-    ValueT*                         vector_x,
-    ValueT*                         vector_y_in,
-    ValueT*                         vector_y_out,
-    ValueT                          alpha,
-    ValueT                          beta)
-{
-    for (OffsetT row = 0; row < a.num_rows; ++row)
-    {
-        ValueT partial = beta * vector_y_in[row];
-        for (
-            OffsetT offset = a.row_offsets[row];
-            offset < a.row_offsets[row + 1];
-            ++offset)
-        {
-            partial += alpha * a.values[offset] * vector_x[a.column_indices[offset]];
-        }
-        vector_y_out[row] = partial;
-    }
-}
-
-
-//---------------------------------------------------------------------
-// GPU I/O proxy
-//---------------------------------------------------------------------
-
-/**
- * Read every matrix nonzero value, read every corresponding vector value
- */
-template <
-    int         BLOCK_THREADS,
-    int         ITEMS_PER_THREAD,
-    typename    ValueT,
-    typename    OffsetT,
-    typename    VectorItr>
-__launch_bounds__ (int(BLOCK_THREADS))
-__global__ void NonZeroIoKernel(
-    SpmvParams<ValueT, OffsetT> params,
-    VectorItr                   d_vector_x)
-{
-    enum
-    {
-        TILE_ITEMS      = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-
-    ValueT nonzero = 0.0;
-
-    int tile_idx = blockIdx.x;
-
-    OffsetT block_offset = tile_idx * TILE_ITEMS;
-
-    OffsetT column_indices[ITEMS_PER_THREAD];
-    ValueT values[ITEMS_PER_THREAD];
-
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {
-        OffsetT nonzero_idx = block_offset + (ITEM * BLOCK_THREADS) + threadIdx.x;
-
-        OffsetT* ci = params.d_column_indices + nonzero_idx;
-        ValueT*a = params.d_values + nonzero_idx;
-
-        column_indices[ITEM]    = (nonzero_idx < params.num_nonzeros) ? *ci : 0;
-        values[ITEM]            = (nonzero_idx < params.num_nonzeros) ? *a : 0.0;
-    }
-
-    __syncthreads();
-
-    // Read vector
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {
-        ValueT vector_value    = ThreadLoad<LOAD_LDG>(params.d_vector_x + column_indices[ITEM]);
-        nonzero                += vector_value * values[ITEM];
-    }
-
-    __syncthreads();
-
-    if (block_offset < params.num_rows)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            OffsetT row_idx = block_offset + (ITEM * BLOCK_THREADS) + threadIdx.x;
-            if (row_idx < params.num_rows)
-            {
-                OffsetT row_end_offset = ThreadLoad<LOAD_DEFAULT>(params.d_row_end_offsets + row_idx);
-
-                if ((row_end_offset >= 0) && (nonzero == nonzero))
-                    params.d_vector_y[row_idx] = nonzero;
-            }
-        }
-    }
-
-}
-
-
-/**
- * Run GPU I/O proxy
- */
-template <
-    typename ValueT,
-    typename OffsetT>
-float TestGpuCsrIoProxy(
-    SpmvParams<ValueT, OffsetT>&    params,
-    int                             timing_iterations)
-{
-    enum {
-        BLOCK_THREADS       = 128,
-        ITEMS_PER_THREAD    = 7,
-        TILE_SIZE           = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-//    size_t smem = 1024 * 16;
-    size_t smem = 1024 * 0;
-
-    unsigned int nonzero_blocks = (params.num_nonzeros + TILE_SIZE - 1) / TILE_SIZE;
-    unsigned int row_blocks = (params.num_rows + TILE_SIZE - 1) / TILE_SIZE;
-    unsigned int blocks = std::max(nonzero_blocks, row_blocks);
-
-    typedef TexRefInputIterator<ValueT, 1234, int> TexItr;
-    TexItr x_itr;
-    CubDebugExit(x_itr.BindTexture(params.d_vector_x));
-
-    // Get device ordinal
-    int device_ordinal;
-    CubDebugExit(cudaGetDevice(&device_ordinal));
-
-    // Get device SM version
-    int sm_version;
-    CubDebugExit(SmVersion(sm_version, device_ordinal));
-
-    void (*kernel)(SpmvParams<ValueT, OffsetT>, TexItr) = NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD>;
-
-
-    int spmv_sm_occupancy;
-    CubDebugExit(MaxSmOccupancy(spmv_sm_occupancy, kernel, BLOCK_THREADS, smem));
-
-    if (!g_quiet)
-        printf("NonZeroIoKernel<%d,%d><<<%d, %d>>>, sm occupancy %d\n", BLOCK_THREADS, ITEMS_PER_THREAD, blocks, BLOCK_THREADS, spmv_sm_occupancy);
-
-    // Warmup
-    NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
-
-    // Check for failures
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(SyncStream(0));
-
-    // Timing
-    GpuTimer timer;
-    float elapsed_millis = 0.0;
-    timer.Start();
-    for (int it = 0; it < timing_iterations; ++it)
-    {
-        NonZeroIoKernel<BLOCK_THREADS, ITEMS_PER_THREAD><<<blocks, BLOCK_THREADS, smem>>>(params, x_itr);
-    }
-    timer.Stop();
-    elapsed_millis += timer.ElapsedMillis();
-
-    CubDebugExit(x_itr.UnbindTexture());
-
-    return elapsed_millis / timing_iterations;
-}
-
-
-
-//---------------------------------------------------------------------
-// cuSparse HybMV
-//---------------------------------------------------------------------
-
-/**
- * Run cuSparse HYB SpMV (specialized for fp32)
- */
-template <
-    typename OffsetT>
-float TestCusparseHybmv(
-    float*                          vector_y_in,
-    float*                          reference_vector_y_out,
-    SpmvParams<float, OffsetT>&     params,
-    int                             timing_iterations,
-    cusparseHandle_t                cusparse)
-{
-    CpuTimer cpu_timer;
-    cpu_timer.Start();
-
-    // Construct Hyb matrix
-    cusparseMatDescr_t mat_desc;
-    cusparseHybMat_t hyb_desc;
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
-    cusparseStatus_t status = cusparseScsr2hyb(
-        cusparse,
-        params.num_rows, params.num_cols,
-        mat_desc,
-        params.d_values, params.d_row_end_offsets, params.d_column_indices,
-        hyb_desc,
-        0,
-        CUSPARSE_HYB_PARTITION_AUTO);
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, status);
-
-    cudaDeviceSynchronize();
-    cpu_timer.Stop();
-    float elapsed_millis = cpu_timer.ElapsedMillis();
-    printf("HYB setup ms, %.5f, ", elapsed_millis);
-
-    // Reset input/output vector y
-    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
-
-    // Warmup
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
-        cusparse,
-        CUSPARSE_OPERATION_NON_TRANSPOSE,
-        &params.alpha, mat_desc,
-        hyb_desc,
-        params.d_vector_x, &params.beta, params.d_vector_y));
-
-    if (!g_quiet)
-    {
-        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
-        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
-    }
-
-    // Timing
-    elapsed_millis    = 0.0;
-    GpuTimer timer;
-
-    timer.Start();
-    for(int it = 0; it < timing_iterations; ++it)
-    {
-        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseShybmv(
-            cusparse,
-            CUSPARSE_OPERATION_NON_TRANSPOSE,
-            &params.alpha, mat_desc,
-            hyb_desc,
-            params.d_vector_x, &params.beta, params.d_vector_y));
-    }
-    timer.Stop();
-    elapsed_millis += timer.ElapsedMillis();
-
-    // Cleanup
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
-
-    return elapsed_millis / timing_iterations;
-}
-
-
-/**
- * Run cuSparse HYB SpMV (specialized for fp64)
- */
-template <
-    typename OffsetT>
-float TestCusparseHybmv(
-    double*                         vector_y_in,
-    double*                         reference_vector_y_out,
-    SpmvParams<double, OffsetT>&    params,
-    int                             timing_iterations,
-    cusparseHandle_t                cusparse)
-{
-    CpuTimer cpu_timer;
-    cpu_timer.Start();
-
-    // Construct Hyb matrix
-    cusparseMatDescr_t mat_desc;
-    cusparseHybMat_t hyb_desc;
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&mat_desc));
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateHybMat(&hyb_desc));
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsr2hyb(
-        cusparse,
-        params.num_rows, params.num_cols,
-        mat_desc,
-        params.d_values, params.d_row_end_offsets, params.d_column_indices,
-        hyb_desc,
-        0,
-        CUSPARSE_HYB_PARTITION_AUTO));
-
-    cudaDeviceSynchronize();
-    cpu_timer.Stop();
-    float elapsed_millis = cpu_timer.ElapsedMillis();
-    printf("HYB setup ms, %.5f, ", elapsed_millis);
-
-    // Reset input/output vector y
-    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
-
-    // Warmup
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
-        cusparse,
-        CUSPARSE_OPERATION_NON_TRANSPOSE,
-        &params.alpha, mat_desc,
-        hyb_desc,
-        params.d_vector_x, &params.beta, params.d_vector_y));
-
-    if (!g_quiet)
-    {
-        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
-        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
-    }
-
-    // Timing
-    elapsed_millis    = 0.0;
-    GpuTimer timer;
-
-    timer.Start();
-    for(int it = 0; it < timing_iterations; ++it)
-    {
-        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDhybmv(
-            cusparse,
-            CUSPARSE_OPERATION_NON_TRANSPOSE,
-            &params.alpha, mat_desc,
-            hyb_desc,
-            params.d_vector_x, &params.beta, params.d_vector_y));
-    }
-    timer.Stop();
-    elapsed_millis += timer.ElapsedMillis();
-
-    // Cleanup
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyHybMat(hyb_desc));
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(mat_desc));
-
-    return elapsed_millis / timing_iterations;
-}
-
-
-
-//---------------------------------------------------------------------
-// cuSparse CsrMV
-//---------------------------------------------------------------------
-
-/**
- * Run cuSparse SpMV (specialized for fp32)
- */
-template <
-    typename OffsetT>
-float TestCusparseCsrmv(
-    float*                          vector_y_in,
-    float*                          reference_vector_y_out,
-    SpmvParams<float, OffsetT>&     params,
-    int                             timing_iterations,
-    cusparseHandle_t                cusparse)
-{
-    cusparseMatDescr_t desc;
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));
-
-    // Reset input/output vector y
-    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
-
-    // Warmup
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
-        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
-        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
-        params.d_values, params.d_row_end_offsets, params.d_column_indices,
-        params.d_vector_x, &params.beta, params.d_vector_y));
-
-    if (!g_quiet)
-    {
-        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
-        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
-    }
-
-    // Timing
-    float elapsed_millis    = 0.0;
-    GpuTimer timer;
-
-    timer.Start();
-    for(int it = 0; it < timing_iterations; ++it)
-    {
-        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseScsrmv(
-            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
-            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
-            params.d_values, params.d_row_end_offsets, params.d_column_indices,
-            params.d_vector_x, &params.beta, params.d_vector_y));
-    }
-    timer.Stop();
-    elapsed_millis += timer.ElapsedMillis();
-
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
-    return elapsed_millis / timing_iterations;
-}
-
-
-/**
- * Run cuSparse SpMV (specialized for fp64)
- */
-template <
-    typename OffsetT>
-float TestCusparseCsrmv(
-    double*                         vector_y_in,
-    double*                         reference_vector_y_out,
-    SpmvParams<double, OffsetT>&    params,
-    int                             timing_iterations,
-    cusparseHandle_t                cusparse)
-{
-    cusparseMatDescr_t desc;
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreateMatDescr(&desc));
-
-    // Reset input/output vector y
-    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(float) * params.num_rows, cudaMemcpyHostToDevice));
-
-    // Warmup
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
-        cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
-        params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
-        params.d_values, params.d_row_end_offsets, params.d_column_indices,
-        params.d_vector_x, &params.beta, params.d_vector_y));
-
-    if (!g_quiet)
-    {
-        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
-        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
-    }
-
-    // Timing
-    float elapsed_millis = 0.0;
-    GpuTimer timer;
-    timer.Start();
-    for(int it = 0; it < timing_iterations; ++it)
-    {
-        AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDcsrmv(
-            cusparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
-            params.num_rows, params.num_cols, params.num_nonzeros, &params.alpha, desc,
-            params.d_values, params.d_row_end_offsets, params.d_column_indices,
-            params.d_vector_x, &params.beta, params.d_vector_y));
-
-    }
-    timer.Stop();
-    elapsed_millis += timer.ElapsedMillis();
-
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseDestroyMatDescr(desc));
-    return elapsed_millis / timing_iterations;
-}
-
-//---------------------------------------------------------------------
-// GPU Merge-based SpMV
-//---------------------------------------------------------------------
-
-/**
- * Run CUB SpMV
- */
-template <
-    typename ValueT,
-    typename OffsetT>
-float TestGpuMergeCsrmv(
-    ValueT*                         vector_y_in,
-    ValueT*                         reference_vector_y_out,
-    SpmvParams<ValueT, OffsetT>&    params,
-    int                             timing_iterations)
-{
-    // Allocate temporary storage
-    size_t temp_storage_bytes = 0;
-    void *d_temp_storage = NULL;
-
-    // Get amount of temporary storage needed
-    CubDebugExit(DeviceSpmv::CsrMV(
-        d_temp_storage, temp_storage_bytes,
-        params.d_values, params.d_row_end_offsets, params.d_column_indices,
-        params.d_vector_x, params.d_vector_y,
-        params.num_rows, params.num_cols, params.num_nonzeros,
-// params.alpha, params.beta,
-        (cudaStream_t) 0, false));
-
-    // Allocate
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Reset input/output vector y
-    CubDebugExit(cudaMemcpy(params.d_vector_y, vector_y_in, sizeof(ValueT) * params.num_rows, cudaMemcpyHostToDevice));
-
-    // Warmup
-    CubDebugExit(DeviceSpmv::CsrMV(
-        d_temp_storage, temp_storage_bytes,
-        params.d_values, params.d_row_end_offsets, params.d_column_indices,
-        params.d_vector_x, params.d_vector_y,
-        params.num_rows, params.num_cols, params.num_nonzeros, 
-// params.alpha, params.beta,
-        (cudaStream_t) 0, !g_quiet));
-
-    if (!g_quiet)
-    {
-        int compare = CompareDeviceResults(reference_vector_y_out, params.d_vector_y, params.num_rows, true, g_verbose);
-        printf("\t%s\n", compare ? "FAIL" : "PASS"); fflush(stdout);
-    }
-
-    // Timing
-    GpuTimer timer;
-    float elapsed_millis = 0.0;
-
-    timer.Start();
-    for(int it = 0; it < timing_iterations; ++it)
-    {
-        CubDebugExit(DeviceSpmv::CsrMV(
-            d_temp_storage, temp_storage_bytes,
-            params.d_values, params.d_row_end_offsets, params.d_column_indices,
-            params.d_vector_x, params.d_vector_y,
-            params.num_rows, params.num_cols, params.num_nonzeros, 
-// params.alpha, params.beta,
-            (cudaStream_t) 0, false));
-    }
-    timer.Stop();
-    elapsed_millis += timer.ElapsedMillis();
-
-    return elapsed_millis / timing_iterations;
-}
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-/**
- * Display perf
- */
-template <typename ValueT, typename OffsetT>
-void DisplayPerf(
-    float                           device_giga_bandwidth,
-    double                          avg_millis,
-    CsrMatrix<ValueT, OffsetT>&     csr_matrix)
-{
-    double nz_throughput, effective_bandwidth;
-    size_t total_bytes = (csr_matrix.num_nonzeros * (sizeof(ValueT) * 2 + sizeof(OffsetT))) +
-        (csr_matrix.num_rows) * (sizeof(OffsetT) + sizeof(ValueT));
-
-    nz_throughput       = double(csr_matrix.num_nonzeros) / avg_millis / 1.0e6;
-    effective_bandwidth = double(total_bytes) / avg_millis / 1.0e6;
-
-    if (!g_quiet)
-        printf("fp%d: %.4f avg ms, %.5f gflops, %.3lf effective GB/s (%.2f%% peak)\n",
-            sizeof(ValueT) * 8,
-            avg_millis,
-            2 * nz_throughput,
-            effective_bandwidth,
-            effective_bandwidth / device_giga_bandwidth * 100);
-    else
-        printf("%.5f, %.6f, %.3lf, %.2f%%, ",
-            avg_millis,
-            2 * nz_throughput,
-            effective_bandwidth,
-            effective_bandwidth / device_giga_bandwidth * 100);
-
-    fflush(stdout);
-}
-
-
-
-/**
- * Run tests
- */
-template <
-    typename ValueT,
-    typename OffsetT>
-void RunTest(
-    bool                        rcm_relabel,
-    ValueT                      alpha,
-    ValueT                      beta,
-    CooMatrix<ValueT, OffsetT>& coo_matrix,
-    int                         timing_iterations,
-    CommandLineArgs&            args)
-{
-    // Adaptive timing iterations: run 16 billion nonzeros through
-    if (timing_iterations == -1)
-        timing_iterations = std::min(50000ull, std::max(100ull, ((16ull << 30) / coo_matrix.num_nonzeros)));
-
-    if (!g_quiet)
-        printf("\t%d timing iterations\n", timing_iterations);
-
-    // Convert to CSR
-    CsrMatrix<ValueT, OffsetT> csr_matrix;
-    csr_matrix.FromCoo(coo_matrix);
-    if (!args.CheckCmdLineFlag("csrmv"))
-        coo_matrix.Clear();
-
-    // Relabel
-    if (rcm_relabel)
-    {
-        if (!g_quiet)
-        {
-            csr_matrix.Stats().Display();
-            printf("\n");
-            csr_matrix.DisplayHistogram();
-            printf("\n");
-            if (g_verbose2)
-                csr_matrix.Display();
-            printf("\n");
-        }
-
-        RcmRelabel(csr_matrix, !g_quiet);
-
-        if (!g_quiet) printf("\n");
-    }
-
-    // Display matrix info
-    csr_matrix.Stats().Display(!g_quiet);
-    if (!g_quiet)
-    {
-        printf("\n");
-        csr_matrix.DisplayHistogram();
-        printf("\n");
-        if (g_verbose2)
-            csr_matrix.Display();
-        printf("\n");
-    }
-    fflush(stdout);
-
-    // Allocate input and output vectors
-    ValueT* vector_x        = new ValueT[csr_matrix.num_cols];
-    ValueT* vector_y_in     = new ValueT[csr_matrix.num_rows];
-    ValueT* vector_y_out    = new ValueT[csr_matrix.num_rows];
-
-    for (int col = 0; col < csr_matrix.num_cols; ++col)
-        vector_x[col] = 1.0;
-
-    for (int row = 0; row < csr_matrix.num_rows; ++row)
-        vector_y_in[row] = 1.0;
-
-    // Compute reference answer
-    SpmvGold(csr_matrix, vector_x, vector_y_in, vector_y_out, alpha, beta);
-
-    float avg_millis;
-
-    if (g_quiet) {
-        printf("%s, %s, ", args.deviceProp.name, (sizeof(ValueT) > 4) ? "fp64" : "fp32"); fflush(stdout);
-    }
-
-    // Get GPU device bandwidth (GB/s)
-    float device_giga_bandwidth = args.device_giga_bandwidth;
-
-    // Allocate and initialize GPU problem
-    SpmvParams<ValueT, OffsetT> params;
-
-    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_values,          sizeof(ValueT) * csr_matrix.num_nonzeros));
-    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_row_end_offsets, sizeof(OffsetT) * (csr_matrix.num_rows + 1)));
-    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_column_indices,  sizeof(OffsetT) * csr_matrix.num_nonzeros));
-    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_vector_x,        sizeof(ValueT) * csr_matrix.num_cols));
-    CubDebugExit(g_allocator.DeviceAllocate((void **) &params.d_vector_y,        sizeof(ValueT) * csr_matrix.num_rows));
-    params.num_rows         = csr_matrix.num_rows;
-    params.num_cols         = csr_matrix.num_cols;
-    params.num_nonzeros     = csr_matrix.num_nonzeros;
-    params.alpha            = alpha;
-    params.beta             = beta;
-
-    CubDebugExit(cudaMemcpy(params.d_values,            csr_matrix.values,          sizeof(ValueT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(params.d_row_end_offsets,   csr_matrix.row_offsets,     sizeof(OffsetT) * (csr_matrix.num_rows + 1), cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(params.d_column_indices,    csr_matrix.column_indices,  sizeof(OffsetT) * csr_matrix.num_nonzeros, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(params.d_vector_x,          vector_x,                   sizeof(ValueT) * csr_matrix.num_cols, cudaMemcpyHostToDevice));
-
-    if (!g_quiet) printf("\n\n");
-    printf("GPU CSR I/O Prox, "); fflush(stdout);
-    avg_millis = TestGpuCsrIoProxy(params, timing_iterations);
-    DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
-
-    if (args.CheckCmdLineFlag("csrmv"))
-    {
-        if (!g_quiet) printf("\n\n");
-        printf("CUB, "); fflush(stdout);
-        avg_millis = TestGpuMergeCsrmv(vector_y_in, vector_y_out, params, timing_iterations);
-        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
-    }
-
-    // Initialize cuSparse
-    cusparseHandle_t cusparse;
-    AssertEquals(CUSPARSE_STATUS_SUCCESS, cusparseCreate(&cusparse));
-
-    if (args.CheckCmdLineFlag("csrmv"))
-    {
-        if (!g_quiet) printf("\n\n");
-        printf("Cusparse CsrMV, "); fflush(stdout);
-        avg_millis = TestCusparseCsrmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
-        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
-    }
-
-    if (args.CheckCmdLineFlag("hybmv"))
-    {
-        if (!g_quiet) printf("\n\n");
-        printf("Cusparse HybMV, "); fflush(stdout);
-
-        avg_millis = TestCusparseHybmv(vector_y_in, vector_y_out, params, timing_iterations, cusparse);
-        DisplayPerf(device_giga_bandwidth, avg_millis, csr_matrix);
-    }
-
-
-    // Cleanup
-    if (params.d_values)            CubDebugExit(g_allocator.DeviceFree(params.d_values));
-    if (params.d_row_end_offsets)   CubDebugExit(g_allocator.DeviceFree(params.d_row_end_offsets));
-    if (params.d_column_indices)    CubDebugExit(g_allocator.DeviceFree(params.d_column_indices));
-    if (params.d_vector_x)          CubDebugExit(g_allocator.DeviceFree(params.d_vector_x));
-    if (params.d_vector_y)          CubDebugExit(g_allocator.DeviceFree(params.d_vector_y));
-
-    if (vector_x)                   delete[] vector_x;
-    if (vector_y_in)                delete[] vector_y_in;
-    if (vector_y_out)               delete[] vector_y_out;
-}
-
-/**
- * Run tests
- */
-template <
-    typename ValueT,
-    typename OffsetT>
-void RunTests(
-    bool                rcm_relabel,
-    ValueT              alpha,
-    ValueT              beta,
-    const std::string&  mtx_filename,
-    int                 grid2d,
-    int                 grid3d,
-    int                 wheel,
-    int                 dense,
-    int                 timing_iterations,
-    CommandLineArgs&    args)
-{
-    // Initialize matrix in COO form
-    CooMatrix<ValueT, OffsetT> coo_matrix;
-
-    if (!mtx_filename.empty())
-    {
-        // Parse matrix market file
-        printf("%s, ", mtx_filename.c_str()); fflush(stdout);
-        coo_matrix.InitMarket(mtx_filename, 1.0, !g_quiet);
-
-        if ((coo_matrix.num_rows == 1) || (coo_matrix.num_cols == 1) || (coo_matrix.num_nonzeros == 1))
-        {
-            if (!g_quiet) printf("Trivial dataset\n");
-            exit(0);
-        }
-    }
-    else if (grid2d > 0)
-    {
-        // Generate 2D lattice
-        printf("grid2d_%d, ", grid2d); fflush(stdout);
-        coo_matrix.InitGrid2d(grid2d, false);
-    }
-    else if (grid3d > 0)
-    {
-        // Generate 3D lattice
-        printf("grid3d_%d, ", grid3d); fflush(stdout);
-        coo_matrix.InitGrid3d(grid3d, false);
-    }
-    else if (wheel > 0)
-    {
-        // Generate wheel graph
-        printf("wheel_%d, ", grid2d); fflush(stdout);
-        coo_matrix.InitWheel(wheel);
-    }
-    else if (dense > 0)
-    {
-        // Generate dense graph
-        OffsetT size = 1 << 24; // 16M nnz
-        args.GetCmdLineArgument("size", size);
-
-        OffsetT rows = size / dense;
-        printf("dense_%d_x_%d, ", rows, dense); fflush(stdout);
-        coo_matrix.InitDense(rows, dense);
-    }
-    else
-    {
-        fprintf(stderr, "No graph type specified.\n");
-        exit(1);
-    }
-
-    RunTest(
-        rcm_relabel,
-        alpha,
-        beta,
-        coo_matrix,
-        timing_iterations,
-        args);
-}
-
-
-
-/**
- * Main
- */
-int main(int argc, char **argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf(
-            "%s "
-            "[--csrmv | --hybmv | --bsrmv ] "
-            "[--device=<device-id>] "
-            "[--quiet] "
-            "[--v] "
-            "[--i=<timing iterations>] "
-            "[--fp64] "
-            "[--rcm] "
-            "[--alpha=<alpha scalar (default: 1.0)>] "
-            "[--beta=<beta scalar (default: 0.0)>] "
-            "\n\t"
-                "--mtx=<matrix market file> "
-            "\n\t"
-                "--dense=<cols>"
-            "\n\t"
-                "--grid2d=<width>"
-            "\n\t"
-                "--grid3d=<width>"
-            "\n\t"
-                "--wheel=<spokes>"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    bool                fp64;
-    bool                rcm_relabel;
-    std::string         mtx_filename;
-    int                 grid2d              = -1;
-    int                 grid3d              = -1;
-    int                 wheel               = -1;
-    int                 dense               = -1;
-    int                 timing_iterations   = -1;
-    float               alpha               = 1.0;
-    float               beta                = 0.0;
-
-    g_verbose = args.CheckCmdLineFlag("v");
-    g_verbose2 = args.CheckCmdLineFlag("v2");
-    g_quiet = args.CheckCmdLineFlag("quiet");
-    fp64 = args.CheckCmdLineFlag("fp64");
-    rcm_relabel = args.CheckCmdLineFlag("rcm");
-    args.GetCmdLineArgument("i", timing_iterations);
-    args.GetCmdLineArgument("mtx", mtx_filename);
-    args.GetCmdLineArgument("grid2d", grid2d);
-    args.GetCmdLineArgument("grid3d", grid3d);
-    args.GetCmdLineArgument("wheel", wheel);
-    args.GetCmdLineArgument("dense", dense);
-    args.GetCmdLineArgument("alpha", alpha);
-    args.GetCmdLineArgument("beta", beta);
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Run test(s)
-    if (fp64)
-    {
-        RunTests<double, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
-    }
-    else
-    {
-        RunTests<float, int>(rcm_relabel, alpha, beta, mtx_filename, grid2d, grid3d, wheel, dense, timing_iterations, args);
-    }
-
-    CubDebugExit(cudaDeviceSynchronize());
-    printf("\n");
-
-    return 0;
-}
diff --git a/external/cub/experimental/spmv_script.sh b/external/cub/experimental/spmv_script.sh
deleted file mode 100755
index f43204315a3..00000000000
--- a/external/cub/experimental/spmv_script.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-for i in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 8388608 16777216
-do
-	echo `date`, `$1 --dense=$i $2 $3 $4 $5 $6 $7`
-done
-
-echo
-echo
-
-for i in `ls /home/dumerrill/graphs/spmv/*.mtx`
-do
-    if [[ ( "`head -n 50 $i | grep complex`" = "" ) && ( "`head -n 50 $i | grep array`" = "" ) ]] 
-    then
-    	echo `date`, `$1 --mtx=$i $2 $3 $4 $5 $6 $7 2>/dev/null`
-    fi
-done
-
-echo
-echo
-
-for i in `ls /scratch/dumerrill/graphs/mtx/*.mtx`
-#for i in `ls /cygdrive/w/Dev/UFget/mtx/*.mtx`
-do 
-    if [[ ( "`head -n 50 $i | grep complex`" = "" ) && ( "`head -n 50 $i | grep array`" = "" ) ]] 
-    then
-    	echo `date`, `$1 --mtx=$i $2 $3 $4 $5 $6 $7 2>/dev/null`
-    fi
-done 
-
diff --git a/external/cub/test/Makefile b/external/cub/test/Makefile
deleted file mode 100644
index 26d253594f2..00000000000
--- a/external/cub/test/Makefile
+++ /dev/null
@@ -1,453 +0,0 @@
-#/******************************************************************************
-# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
-# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
-# * 
-# * Redistribution and use in source and binary forms, with or without
-# * modification, are permitted provided that the following conditions are met:
-# *	 * Redistributions of source code must retain the above copyright
-# *	   notice, this list of conditions and the following disclaimer.
-# *	 * Redistributions in binary form must reproduce the above copyright
-# *	   notice, this list of conditions and the following disclaimer in the
-# *	   documentation and/or other materials provided with the distribution.
-# *	 * Neither the name of the NVIDIA CORPORATION nor the
-# *	   names of its contributors may be used to endorse or promote products
-# *	   derived from this software without specific prior written permission.
-# * 
-# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-# *
-#******************************************************************************/
-
-
-#-------------------------------------------------------------------------------
-#
-# Makefile usage
-#
-# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>] [quicktest=<0|1>] [quickertest=<0|1>]
-#
-#-------------------------------------------------------------------------------
-
-include ../common.mk 
- 
-#-------------------------------------------------------------------------------
-# Commandline Options
-#-------------------------------------------------------------------------------
-
-# Testing mode option (quick/thorough)
-ifeq ($(quickertest), 1)
-	NVCCFLAGS += -DQUICKER_TEST
-	TEST_SUFFIX = quicker
-else ifeq ($(quicktest), 1)
-	NVCCFLAGS += -DQUICK_TEST
-	TEST_SUFFIX = quick
-else 
-	TEST_SUFFIX = thorough
-	NPPI = 
-endif
-
-
-# CUDA memcheck (enabled by default) 
-ifeq ($(memcheck), 0)
-	MEMCHECK = 
-else 
-	MEMCHECK = cuda-memcheck
-endif
-
-
-#-------------------------------------------------------------------------------
-# Compiler and compilation platform
-#-------------------------------------------------------------------------------
-
-# Includes
-INC += -I$(CUB_DIR) -I$(CUB_DIR)test 
-
-# Suffix to append to each binary
-SUFFIX = $(BIN_SUFFIX)_$(TEST_SUFFIX)
-
-# Define test arch
-DEFINES += -DTEST_ARCH=$(TEST_ARCH)
-
-
-#-------------------------------------------------------------------------------
-# Dependency Lists
-#-------------------------------------------------------------------------------
-
-rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
-
-DEPS =				$(CUB_DEPS) \
-					$(CUB_DIR)test/Makefile \
-					$(CUB_DIR)test/test_util.h \
-					$(CUB_DIR)test/mersenne.h \
-
-BLOCK_REDUCE = 		test_block_reduce_raking \
-	 				test_block_reduce_warp_reductions		
-
-
-BLOCK_SCAN = 		test_block_scan_raking \
-	 				test_block_scan_raking_memoize \
-	 				test_block_scan_warp_scans		
-
-
-BLOCK_RADIX_SORT = 	test_block_radix_sort_keys \
-	 				test_block_radix_sort_pairs	
-
-		
-ALL = 				link \
-	 				test_iterator \
-	 				test_allocator \
-	 				test_warp_scan \
-	 				test_warp_reduce \
-	 				$(BLOCK_REDUCE) \
-	 				$(BLOCK_SCAN) \
-	 				$(BLOCK_RADIX_SORT) \
-	 				test_block_load_store \
-	 				test_block_histogram \
-				 	test_device_reduce \
-			 		test_device_histogram \
-			 		test_device_scan \
-			 		test_device_radix_sort \
-					test_device_reduce_by_key\
-					test_device_run_length_encode\
-		 			test_device_select_unique \
-					test_device_select_if 
-		
-#	 	test_grid_barrier \		fails on sm110
-#	 	test_device_seg_reduce
-		
-
-
-#-------------------------------------------------------------------------------
-# make default
-#-------------------------------------------------------------------------------
-
-default:
-
-
-#-------------------------------------------------------------------------------
-# make clean
-#-------------------------------------------------------------------------------
-
-clean :
-	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
-	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
-
-
-#-------------------------------------------------------------------------------
-# make all
-#-------------------------------------------------------------------------------
-
-all : $(ALL)
-
-
-#-------------------------------------------------------------------------------
-# make run
-#-------------------------------------------------------------------------------
-
-run : 
-	for i in $(ALL); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
-
-run_block_reduce : 
-	for i in $(BLOCK_REDUCE); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
-
-run_block_scan : 
-	for i in $(BLOCK_SCAN); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
-
-run_block_radix_sort : 
-	for i in $(BLOCK_RADIX_SORT); do $(MEMCHECK) ./bin/$${i}_$(SUFFIX) --device=$(device) || exit 1; done
-
-
-
-#-------------------------------------------------------------------------------
-# make link
-#-------------------------------------------------------------------------------
-
-link : bin/link_$(SUFFIX)
-
-bin/link_$(SUFFIX) : link_a.cu link_b.cu link_main.cpp $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(DEFINES) $(SM_TARGETS) link_a.cu -c -o bin/link_a.obj
-	$(NVCC) $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(DEFINES) $(SM_TARGETS) link_b.cu -c -o bin/link_b.obj
-	$(NVCC) $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(DEFINES) $(SM_TARGETS) link_main.cpp bin/link_a.obj bin/link_b.obj -o bin/link_$(SUFFIX)
-
-
-#-------------------------------------------------------------------------------
-# make test_iterator 
-#-------------------------------------------------------------------------------
-
-test_iterator: bin/test_iterator_$(SUFFIX)
-
-bin/test_iterator_$(SUFFIX) : test_iterator.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_iterator_$(SUFFIX) test_iterator.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make test_allocator 
-#-------------------------------------------------------------------------------
-
-test_allocator: bin/test_allocator_$(SUFFIX)
-
-bin/test_allocator_$(SUFFIX) : test_allocator.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_allocator_$(SUFFIX) test_allocator.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-	
-	
-#-------------------------------------------------------------------------------
-# make test_grid_barrier 
-#-------------------------------------------------------------------------------
-
-test_grid_barrier: bin/test_grid_barrier_$(SUFFIX)
-
-bin/test_grid_barrier_$(SUFFIX) : test_grid_barrier.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_grid_barrier_$(SUFFIX) test_grid_barrier.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
-	
-
-#-------------------------------------------------------------------------------
-# make test_warp_scan 
-#-------------------------------------------------------------------------------
-
-test_warp_scan: bin/test_warp_scan_$(SUFFIX)
-
-bin/test_warp_scan_$(SUFFIX) : test_warp_scan.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_warp_scan_$(SUFFIX) test_warp_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
-
-
-#-------------------------------------------------------------------------------
-# make test_warp_reduce 
-#-------------------------------------------------------------------------------
-
-test_warp_reduce: bin/test_warp_reduce_$(SUFFIX)
-
-bin/test_warp_reduce_$(SUFFIX) : test_warp_reduce.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_warp_reduce_$(SUFFIX) test_warp_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
-
-
-#-------------------------------------------------------------------------------
-# make test_block_reduce_raking
-#-------------------------------------------------------------------------------
-
-test_block_reduce_raking: bin/test_block_reduce_raking_$(SUFFIX)
-
-bin/test_block_reduce_raking_$(SUFFIX) : test_block_reduce.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) -DTEST_RAKING $(SM_TARGETS) -o bin/test_block_reduce_raking_$(SUFFIX) test_block_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
-
-
-#-------------------------------------------------------------------------------
-# make test_block_reduce_warp_reductions 
-#-------------------------------------------------------------------------------
-
-test_block_reduce_warp_reductions: bin/test_block_reduce_warp_reductions_$(SUFFIX)
-
-bin/test_block_reduce_warp_reductions_$(SUFFIX) : test_block_reduce.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) -DTEST_WARP_REDUCTIONS $(SM_TARGETS) -o bin/test_block_reduce_warp_reductions_$(SUFFIX) test_block_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
-
-
-#-------------------------------------------------------------------------------
-# make test_block_reduce 
-#-------------------------------------------------------------------------------
-
-test_block_reduce: $(BLOCK_REDUCE)
-
-
-#-------------------------------------------------------------------------------
-# make test_block_scan_raking
-#-------------------------------------------------------------------------------
-
-test_block_scan_raking: bin/test_block_scan_raking_$(SUFFIX)
-
-bin/test_block_scan_raking_$(SUFFIX) : test_block_scan.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) -DTEST_RAKING $(SM_TARGETS) -o bin/test_block_scan_raking_$(SUFFIX) test_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
-
-
-#-------------------------------------------------------------------------------
-# make test_block_scan_raking_memoize
-#-------------------------------------------------------------------------------
-
-test_block_scan_raking_memoize: bin/test_block_scan_raking_memoize_$(SUFFIX)
-
-bin/test_block_scan_raking_memoize_$(SUFFIX) : test_block_scan.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) -DTEST_RAKING_MEMOIZE $(SM_TARGETS) -o bin/test_block_scan_raking_memoize_$(SUFFIX) test_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
-
-
-#-------------------------------------------------------------------------------
-# make test_block_scan_warp_scans
-#-------------------------------------------------------------------------------
-
-test_block_scan_warp_scans: bin/test_block_scan_warp_scans_$(SUFFIX)
-
-bin/test_block_scan_warp_scans_$(SUFFIX) : test_block_scan.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) -DTEST_WARP_SCANS $(SM_TARGETS) -o bin/test_block_scan_warp_scans_$(SUFFIX) test_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3  
-
-
-#-------------------------------------------------------------------------------
-# make test_block_scan 
-#-------------------------------------------------------------------------------
-
-test_block_scan: $(BLOCK_SCAN)
-
-
-#-------------------------------------------------------------------------------
-# make test_block_load_store 
-#-------------------------------------------------------------------------------
-
-test_block_load_store: bin/test_block_load_store_$(SUFFIX)
-
-bin/test_block_load_store_$(SUFFIX) : test_block_load_store.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_block_load_store_$(SUFFIX) test_block_load_store.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-	
-	
-#-------------------------------------------------------------------------------
-# make test_block_radix_sort_keys 
-#-------------------------------------------------------------------------------
-
-test_block_radix_sort_keys: bin/test_block_radix_sort_keys_$(SUFFIX)
-
-bin/test_block_radix_sort_keys_$(SUFFIX) : test_block_radix_sort.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) -DTEST_KEYS_ONLY $(SM_TARGETS) -o bin/test_block_radix_sort_keys_$(SUFFIX) test_block_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-#-------------------------------------------------------------------------------
-# make test_block_radix_sort_pairs 
-#-------------------------------------------------------------------------------
-
-test_block_radix_sort_pairs: bin/test_block_radix_sort_pairs_$(SUFFIX)
-
-bin/test_block_radix_sort_pairs_$(SUFFIX) : test_block_radix_sort.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_block_radix_sort_pairs_$(SUFFIX) test_block_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make test_block_radix_sort
-#-------------------------------------------------------------------------------
-
-test_block_radix_sort : $(BLOCK_RADIX_SORT)
-
-
-#-------------------------------------------------------------------------------
-# make test_block_histogram 
-#-------------------------------------------------------------------------------
-
-test_block_histogram: bin/test_block_histogram_$(SUFFIX)
-
-bin/test_block_histogram_$(SUFFIX) : test_block_histogram.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_block_histogram_$(SUFFIX) test_block_histogram.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make test_device_reduce
-#-------------------------------------------------------------------------------
-
-test_device_reduce: bin/test_device_reduce_$(SUFFIX)
-
-bin/test_device_reduce_$(SUFFIX) : test_device_reduce.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_reduce_$(SUFFIX) test_device_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make test_device_histogram
-#-------------------------------------------------------------------------------
-
-test_device_histogram: bin/test_device_histogram_$(SUFFIX)
-
-bin/test_device_histogram_$(SUFFIX) : test_device_histogram.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_histogram_$(SUFFIX) test_device_histogram.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) $(NPPI) -O3
-
-
-#-------------------------------------------------------------------------------
-# make test_device_scan
-#-------------------------------------------------------------------------------
-
-test_device_scan: bin/test_device_scan_$(SUFFIX)
-
-bin/test_device_scan_$(SUFFIX) : test_device_scan.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_scan_$(SUFFIX) test_device_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make test_device_radix_sort
-#-------------------------------------------------------------------------------
-
-test_device_radix_sort: bin/test_device_radix_sort_$(SUFFIX)
-
-bin/test_device_radix_sort_$(SUFFIX) : test_device_radix_sort.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_radix_sort_$(SUFFIX) test_device_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make test_device_select_unique
-#-------------------------------------------------------------------------------
-
-test_device_select_unique: bin/test_device_select_unique_$(SUFFIX)
-
-bin/test_device_select_unique_$(SUFFIX) : test_device_select_unique.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_select_unique_$(SUFFIX) test_device_select_unique.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-#-------------------------------------------------------------------------------
-# make test_device_select_if
-#-------------------------------------------------------------------------------
-
-test_device_select_if: bin/test_device_select_if_$(SUFFIX)
-
-bin/test_device_select_if_$(SUFFIX) : test_device_select_if.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_select_if_$(SUFFIX) test_device_select_if.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-#-------------------------------------------------------------------------------
-# make test_device_reduce_by_key
-#-------------------------------------------------------------------------------
-
-test_device_reduce_by_key: bin/test_device_reduce_by_key_$(SUFFIX)
-
-bin/test_device_reduce_by_key_$(SUFFIX) : test_device_reduce_by_key.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_reduce_by_key_$(SUFFIX) test_device_reduce_by_key.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-#-------------------------------------------------------------------------------
-# make test_device_run_length_encode
-#-------------------------------------------------------------------------------
-
-test_device_run_length_encode: bin/test_device_run_length_encode_$(SUFFIX)
-
-bin/test_device_run_length_encode_$(SUFFIX) : test_device_run_length_encode.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_run_length_encode_$(SUFFIX) test_device_run_length_encode.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
-
-
-#-------------------------------------------------------------------------------
-# make test_device_seg_reduce
-#-------------------------------------------------------------------------------
-#
-#test_device_seg_reduce: bin/test_device_seg_reduce_$(SUFFIX)
-#
-#bin/test_device_seg_reduce_$(SUFFIX) : test_device_seg_reduce.cu $(DEPS)
-#	mkdir -p bin
-#	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/test_device_seg_reduce_$(SUFFIX) test_device_seg_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3
-
-
diff --git a/external/cub/test/link_a.cu b/external/cub/test/link_a.cu
deleted file mode 100644
index 8a9b19f93d8..00000000000
--- a/external/cub/test/link_a.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-#include <cub/cub.cuh>
-
-void a()
-{
-    printf("a() called\n");
-
-    cub::DoubleBuffer<unsigned int>     d_keys;
-    cub::DoubleBuffer<cub::NullType>    d_values;
-    size_t                              temp_storage_bytes = 0;
-    cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
-}
diff --git a/external/cub/test/link_b.cu b/external/cub/test/link_b.cu
deleted file mode 100644
index a19ec407d90..00000000000
--- a/external/cub/test/link_b.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-#include <cub/cub.cuh>
-
-void b()
-{
-    printf("b() called\n");
-
-    cub::DoubleBuffer<unsigned int>     d_keys;
-    cub::DoubleBuffer<cub::NullType>    d_values;
-    size_t                              temp_storage_bytes = 0;
-    cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
-}
diff --git a/external/cub/test/link_main.cpp b/external/cub/test/link_main.cpp
deleted file mode 100644
index ef677ee03b4..00000000000
--- a/external/cub/test/link_main.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-#include <stdio.h>
-
-extern void a();
-extern void b();
-
-int main()
-{
-    printf("hello world\n");
-    return 0;
-}
diff --git a/external/cub/test/mersenne.h b/external/cub/test/mersenne.h
deleted file mode 100644
index 76aae809d08..00000000000
--- a/external/cub/test/mersenne.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- A C-program for MT19937, with initialization improved 2002/1/26.
- Coded by Takuji Nishimura and Makoto Matsumoto.
-
- Before using, initialize the state by using init_genrand(seed)
- or init_by_array(init_key, key_length).
-
- Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- 3. The names of its contributors may not be used to endorse or promote
- products derived from this software without specific prior written
- permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-
- Any feedback is very welcome.
- http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
- email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
- */
-
-#include <stdio.h>
-
-namespace mersenne {
-
-/* Period parameters */
-const unsigned int N          = 624;
-const unsigned int M          = 397;
-const unsigned int MATRIX_A   = 0x9908b0df; /* constant vector a */
-const unsigned int UPPER_MASK = 0x80000000; /* most significant w-r bits */
-const unsigned int LOWER_MASK = 0x7fffffff; /* least significant r bits */
-
-static unsigned int mt[N];  /* the array for the state vector  */
-static int mti = N + 1;     /* mti==N+1 means mt[N] is not initialized */
-
-/* initializes mt[N] with a seed */
-void init_genrand(unsigned int s)
-{
-    mt[0] = s & 0xffffffff;
-    for (mti = 1; mti < N; mti++)
-    {
-        mt[mti] = (1812433253 * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti);
-
-        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for mtiplier. */
-        /* In the previous versions, MSBs of the seed affect   */
-        /* only MSBs of the array mt[].                        */
-        /* 2002/01/09 modified by Makoto Matsumoto             */
-
-        mt[mti] &= 0xffffffff;
-        /* for >32 bit machines */
-    }
-}
-
-/* initialize by an array with array-length */
-/* init_key is the array for initializing keys */
-/* key_length is its length */
-/* slight change for C++, 2004/2/26 */
-void init_by_array(unsigned int init_key[], int key_length)
-{
-    int i, j, k;
-    init_genrand(19650218);
-    i = 1;
-    j = 0;
-    k = (N > key_length ? N : key_length);
-    for (; k; k--)
-    {
-        mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1664525))
-            + init_key[j] + j;  /* non linear */
-        mt[i] &= 0xffffffff;    /* for WORDSIZE > 32 machines */
-        i++;
-        j++;
-        if (i >= N)
-        {
-            mt[0] = mt[N - 1];
-            i = 1;
-        }
-        if (j >= key_length) j = 0;
-    }
-    for (k = N - 1; k; k--)
-    {
-        mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1566083941)) - i; /* non linear */
-        mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */
-        i++;
-        if (i >= N)
-        {
-            mt[0] = mt[N - 1];
-            i = 1;
-        }
-    }
-
-    mt[0] = 0x80000000; /* MSB is 1; assuring non-zero initial array */
-}
-
-/* generates a random number on [0,0xffffffff]-interval */
-unsigned int genrand_int32(void)
-{
-    unsigned int y;
-    static unsigned int mag01[2] = { 0x0, MATRIX_A };
-
-    /* mag01[x] = x * MATRIX_A  for x=0,1 */
-
-    if (mti >= N)
-    { /* generate N words at one time */
-        int kk;
-
-        if (mti == N + 1) /* if init_genrand() has not been called, */
-        init_genrand(5489); /* a defat initial seed is used */
-
-        for (kk = 0; kk < N - M; kk++)
-        {
-            y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
-            mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & 0x1];
-        }
-        for (; kk < N - 1; kk++)
-        {
-            y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
-            mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & 0x1];
-        }
-        y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
-        mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & 0x1];
-
-        mti = 0;
-    }
-
-    y = mt[mti++];
-
-    /* Tempering */
-    y ^= (y >> 11);
-    y ^= (y << 7) & 0x9d2c5680;
-    y ^= (y << 15) & 0xefc60000;
-    y ^= (y >> 18);
-
-    return y;
-}
-
-
-
-} // namespace mersenne
diff --git a/external/cub/test/test_allocator.cu b/external/cub/test/test_allocator.cu
deleted file mode 100644
index 8176db68be0..00000000000
--- a/external/cub/test/test_allocator.cu
+++ /dev/null
@@ -1,459 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test evaluation for caching allocator of device memory
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-
-#include <cub/util_allocator.cuh>
-#include "test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>]"
-            "[--bytes=<timing bytes>]"
-            "[--i=<timing iterations>]"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-#if (CUB_PTX_ARCH == 0)
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Get number of GPUs and current GPU
-    int num_gpus;
-    int initial_gpu;
-    int timing_iterations           = 10000;
-    int timing_bytes                = 1024 * 1024;
-
-    if (CubDebug(cudaGetDeviceCount(&num_gpus))) exit(1);
-    if (CubDebug(cudaGetDevice(&initial_gpu))) exit(1);
-    args.GetCmdLineArgument("i", timing_iterations);
-    args.GetCmdLineArgument("bytes", timing_bytes);
-
-    // Create default allocator (caches up to 6MB in device allocations per GPU)
-    CachingDeviceAllocator allocator;
-    allocator.debug = true;
-
-    printf("Running single-gpu tests...\n"); fflush(stdout);
-
-    //
-    // Test0
-    //
-
-    // Create a new stream
-    cudaStream_t other_stream;
-    CubDebugExit(cudaStreamCreate(&other_stream));
-
-    // Allocate 999 bytes on the current gpu in stream0
-    char *d_999B_stream0_a;
-    char *d_999B_stream0_b;
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
-
-    // Run some big kernel in stream 0
-    EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();
-
-    // Free d_999B_stream0_a
-    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
-
-    // Allocate another 999 bytes in stream 0
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
-
-    // Check that that we have 1 live block on the initial GPU
-    AssertEquals(allocator.live_blocks.size(), 1);
-
-    // Check that that we have no cached block on the initial GPU
-    AssertEquals(allocator.cached_blocks.size(), 0);
-
-    // Run some big kernel in stream 0
-    EmptyKernel<void><<<32000, 512, 1024 * 8, 0>>>();
-
-    // Free d_999B_stream0_b
-    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
-
-    // Allocate 999 bytes on the current gpu in other_stream
-    char *d_999B_stream_other_a;
-    char *d_999B_stream_other_b;
-    allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream);
-
-    // Check that that we have 1 live blocks on the initial GPU (that we allocated a new one because d_999B_stream0_b is only available for stream 0 until it becomes idle)
-    AssertEquals(allocator.live_blocks.size(), 1);
-
-    // Check that that we have one cached block on the initial GPU
-    AssertEquals(allocator.cached_blocks.size(), 1);
-
-    // Run some big kernel in other_stream
-    EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();
-
-    // Free d_999B_stream_other
-    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
-
-    // Check that we can now use both allocations in stream 0 after synchronizing the device
-    CubDebugExit(cudaDeviceSynchronize());
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
-
-    // Check that that we have 2 live blocks on the initial GPU
-    AssertEquals(allocator.live_blocks.size(), 2);
-
-    // Check that that we have no cached block on the initial GPU
-    AssertEquals(allocator.cached_blocks.size(), 0);
-
-    // Free d_999B_stream0_a and d_999B_stream0_b
-    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
-    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
-
-    // Check that we can now use both allocations in other_stream
-    CubDebugExit(cudaDeviceSynchronize());
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_a, 999, other_stream));
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream_other_b, 999, other_stream));
-
-    // Check that that we have 2 live blocks on the initial GPU
-    AssertEquals(allocator.live_blocks.size(), 2);
-
-    // Check that that we have no cached block on the initial GPU
-    AssertEquals(allocator.cached_blocks.size(), 0);
-
-    // Run some big kernel in other_stream
-    EmptyKernel<void><<<32000, 512, 1024 * 8, other_stream>>>();
-
-    // Free d_999B_stream_other_a and d_999B_stream_other_b
-    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_a));
-    CubDebugExit(allocator.DeviceFree(d_999B_stream_other_b));
-
-    // Check that we can now use both allocations in stream 0 after synchronizing the device and destroying the other stream
-    CubDebugExit(cudaDeviceSynchronize());
-    CubDebugExit(cudaStreamDestroy(other_stream));
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_a, 999, 0));
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_999B_stream0_b, 999, 0));
-
-    // Check that that we have 2 live blocks on the initial GPU
-    AssertEquals(allocator.live_blocks.size(), 2);
-
-    // Check that that we have no cached block on the initial GPU
-    AssertEquals(allocator.cached_blocks.size(), 0);
-
-    // Free d_999B_stream0_a and d_999B_stream0_b
-    CubDebugExit(allocator.DeviceFree(d_999B_stream0_a));
-    CubDebugExit(allocator.DeviceFree(d_999B_stream0_b));
-
-    // Free all cached
-    CubDebugExit(allocator.FreeAllCached());
-
-    //
-    // Test1
-    //
-
-    // Allocate 5 bytes on the current gpu
-    char *d_5B;
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_5B, 5));
-
-    // Check that that we have zero free bytes cached on the initial GPU
-    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
-
-    // Check that that we have 1 live block on the initial GPU
-    AssertEquals(allocator.live_blocks.size(), 1);
-
-    //
-    // Test2
-    //
-
-    // Allocate 4096 bytes on the current gpu
-    char *d_4096B;
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_4096B, 4096));
-
-    // Check that that we have 2 live blocks on the initial GPU
-    AssertEquals(allocator.live_blocks.size(), 2);
-
-    //
-    // Test3
-    //
-
-    // DeviceFree d_5B
-    CubDebugExit(allocator.DeviceFree(d_5B));
-
-    // Check that that we have min_bin_bytes free bytes cached on the initial gpu
-    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
-
-    // Check that that we have 1 live block on the initial GPU
-    AssertEquals(allocator.live_blocks.size(), 1);
-
-    // Check that that we have 1 cached block on the initial GPU
-    AssertEquals(allocator.cached_blocks.size(), 1);
-
-    //
-    // Test4
-    //
-
-    // DeviceFree d_4096B
-    CubDebugExit(allocator.DeviceFree(d_4096B));
-
-    // Check that that we have the 4096 + min_bin free bytes cached on the initial gpu
-    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes + 4096);
-
-    // Check that that we have 0 live block on the initial GPU
-    AssertEquals(allocator.live_blocks.size(), 0);
-
-    // Check that that we have 2 cached block on the initial GPU
-    AssertEquals(allocator.cached_blocks.size(), 2);
-
-    //
-    // Test5
-    //
-
-    // Allocate 768 bytes on the current gpu
-    char *d_768B;
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_768B, 768));
-
-    // Check that that we have the min_bin free bytes cached on the initial gpu (4096 was reused)
-    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
-
-    // Check that that we have 1 live block on the initial GPU
-    AssertEquals(allocator.live_blocks.size(), 1);
-
-    // Check that that we have 1 cached block on the initial GPU
-    AssertEquals(allocator.cached_blocks.size(), 1);
-
-    //
-    // Test6
-    //
-
-    // Allocate max_cached_bytes on the current gpu
-    char *d_max_cached;
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached, allocator.max_cached_bytes));
-
-    // DeviceFree d_max_cached
-    CubDebugExit(allocator.DeviceFree(d_max_cached));
-
-    // Check that that we have the min_bin free bytes cached on the initial gpu (max cached was not returned because we went over)
-    AssertEquals(allocator.cached_bytes[initial_gpu].free, allocator.min_bin_bytes);
-
-    // Check that that we have 1 live block on the initial GPU
-    AssertEquals(allocator.live_blocks.size(), 1);
-
-    // Check that that we still have 1 cached block on the initial GPU
-    AssertEquals(allocator.cached_blocks.size(), 1);
-
-    //
-    // Test7
-    //
-
-    // Free all cached blocks on all GPUs
-    CubDebugExit(allocator.FreeAllCached());
-
-    // Check that that we have 0 bytes cached on the initial GPU
-    AssertEquals(allocator.cached_bytes[initial_gpu].free, 0);
-
-    // Check that that we have 0 cached blocks across all GPUs
-    AssertEquals(allocator.cached_blocks.size(), 0);
-
-    // Check that that still we have 1 live block across all GPUs
-    AssertEquals(allocator.live_blocks.size(), 1);
-
-    //
-    // Test8
-    //
-
-    // Allocate max cached bytes + 1 on the current gpu
-    char *d_max_cached_plus;
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_max_cached_plus, allocator.max_cached_bytes + 1));
-
-    // DeviceFree max cached bytes
-    CubDebugExit(allocator.DeviceFree(d_max_cached_plus));
-
-    // DeviceFree d_768B
-    CubDebugExit(allocator.DeviceFree(d_768B));
-
-    unsigned int power;
-    size_t rounded_bytes;
-    allocator.NearestPowerOf(power, rounded_bytes, allocator.bin_growth, 768);
-
-    // Check that that we have 4096 free bytes cached on the initial gpu
-    AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
-
-    // Check that that we have 1 cached blocks across all GPUs
-    AssertEquals(allocator.cached_blocks.size(), 1);
-
-    // Check that that still we have 0 live block across all GPUs
-    AssertEquals(allocator.live_blocks.size(), 0);
-
-#ifndef CUB_CDP
-    // BUG: find out why these tests fail when one GPU is CDP compliant and the other is not
-
-    if (num_gpus > 1)
-    {
-        printf("\nRunning multi-gpu tests...\n"); fflush(stdout);
-
-        //
-        // Test9
-        //
-
-        // Allocate 768 bytes on the next gpu
-        int next_gpu = (initial_gpu + 1) % num_gpus;
-        char *d_768B_2;
-        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
-
-        // DeviceFree d_768B on the next gpu
-        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
-
-        // Re-allocate 768 bytes on the next gpu
-        CubDebugExit(allocator.DeviceAllocate(next_gpu, (void **) &d_768B_2, 768));
-
-        // Re-free d_768B on the next gpu
-        CubDebugExit(allocator.DeviceFree(next_gpu, d_768B_2));
-
-        // Check that that we have 4096 free bytes cached on the initial gpu
-        AssertEquals(allocator.cached_bytes[initial_gpu].free, rounded_bytes);
-
-        // Check that that we have 4096 free bytes cached on the second gpu
-        AssertEquals(allocator.cached_bytes[next_gpu].free, rounded_bytes);
-
-        // Check that that we have 2 cached blocks across all GPUs
-        AssertEquals(allocator.cached_blocks.size(), 2);
-
-        // Check that that still we have 0 live block across all GPUs
-        AssertEquals(allocator.live_blocks.size(), 0);
-    }
-#endif  // CUB_CDP
-
-    //
-    // Performance
-    //
-
-    printf("\nCPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
-    fflush(stdout); fflush(stderr);
-
-    // CPU performance comparisons vs cached.  Allocate and free a 1MB block 2000 times
-    CpuTimer    cpu_timer;
-    char        *d_1024MB                       = NULL;
-    allocator.debug                             = false;
-
-    // Prime the caching allocator and the kernel
-    CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
-    CubDebugExit(allocator.DeviceFree(d_1024MB));
-    cub::EmptyKernel<void><<<1, 32>>>();
-
-    // CUDA
-    cpu_timer.Start();
-    for (int i = 0; i < timing_iterations; ++i)
-    {
-        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
-        CubDebugExit(cudaFree(d_1024MB));
-    }
-    cpu_timer.Stop();
-    float cuda_malloc_elapsed_millis = cpu_timer.ElapsedMillis();
-
-    // CUB
-    cpu_timer.Start();
-    for (int i = 0; i < timing_iterations; ++i)
-    {
-        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
-        CubDebugExit(allocator.DeviceFree(d_1024MB));
-    }
-    cpu_timer.Stop();
-    float cub_calloc_elapsed_millis = cpu_timer.ElapsedMillis();
-
-    printf("\t CUB CachingDeviceAllocator allocation CPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
-        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
-        cuda_malloc_elapsed_millis / timing_iterations,
-        cub_calloc_elapsed_millis / timing_iterations);
-
-    // GPU performance comparisons.  Allocate and free a 1MB block 2000 times
-    GpuTimer gpu_timer;
-
-    printf("\nGPU Performance (%d timing iterations, %d bytes):\n", timing_iterations, timing_bytes);
-    fflush(stdout); fflush(stderr);
-
-    // Kernel-only
-    gpu_timer.Start();
-    for (int i = 0; i < timing_iterations; ++i)
-    {
-        cub::EmptyKernel<void><<<1, 32>>>();
-    }
-    gpu_timer.Stop();
-    float cuda_empty_elapsed_millis = gpu_timer.ElapsedMillis();
-
-    // CUDA
-    gpu_timer.Start();
-    for (int i = 0; i < timing_iterations; ++i)
-    {
-        CubDebugExit(cudaMalloc((void **) &d_1024MB, timing_bytes));
-        cub::EmptyKernel<void><<<1, 32>>>();
-        CubDebugExit(cudaFree(d_1024MB));
-    }
-    gpu_timer.Stop();
-    cuda_malloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
-
-    // CUB
-    gpu_timer.Start();
-    for (int i = 0; i < timing_iterations; ++i)
-    {
-        CubDebugExit(allocator.DeviceAllocate((void **) &d_1024MB, timing_bytes));
-        cub::EmptyKernel<void><<<1, 32>>>();
-        CubDebugExit(allocator.DeviceFree(d_1024MB));
-    }
-    gpu_timer.Stop();
-    cub_calloc_elapsed_millis = gpu_timer.ElapsedMillis() - cuda_empty_elapsed_millis;
-
-    printf("\t CUB CachingDeviceAllocator allocation GPU speedup: %.2f (avg cudaMalloc %.4f ms vs. avg DeviceAllocate %.4f ms)\n",
-        cuda_malloc_elapsed_millis / cub_calloc_elapsed_millis,
-        cuda_malloc_elapsed_millis / timing_iterations,
-        cub_calloc_elapsed_millis / timing_iterations);
-
-
-#endif
-
-    printf("Success\n");
-
-    return 0;
-}
-
diff --git a/external/cub/test/test_block_histogram.cu b/external/cub/test/test_block_histogram.cu
deleted file mode 100644
index 1b61341d984..00000000000
--- a/external/cub/test/test_block_histogram.cu
+++ /dev/null
@@ -1,310 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of BlockHistogram utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <limits>
-#include <string>
-#include <typeinfo>
-
-#include <cub/block/block_histogram.cuh>
-#include <cub/block/block_load.cuh>
-#include <cub/block/block_store.cuh>
-#include <cub/util_allocator.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose           = false;
-int                     g_timing_iterations = 0;
-int                     g_repeat            = 0;
-CachingDeviceAllocator  g_allocator(true);
-
-
-//---------------------------------------------------------------------
-// Test kernels
-//---------------------------------------------------------------------
-
-/**
- * BlockHistogram test kernel.
- */
-template <
-    int                     BINS,
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    BlockHistogramAlgorithm ALGORITHM,
-    typename                T,
-    typename                HistoCounter>
-__global__ void BlockHistogramKernel(
-    T                       *d_samples,
-    HistoCounter            *d_histogram)
-{
-    // Parameterize BlockHistogram type for our thread block
-    typedef BlockHistogram<T, BLOCK_THREADS, ITEMS_PER_THREAD, BINS, ALGORITHM> BlockHistogram;
-
-    // Allocate temp storage in shared memory
-    __shared__ typename BlockHistogram::TempStorage temp_storage;
-
-    // Per-thread tile data
-    T data[ITEMS_PER_THREAD];
-    LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_samples, data);
-
-    // Test histo (writing directly to histogram buffer in global)
-    BlockHistogram(temp_storage).Histogram(data, d_histogram);
-}
-
-
-/**
- * Initialize problem (and solution)
- */
-template <
-    int             BINS,
-    typename        SampleT>
-void Initialize(
-    GenMode         gen_mode,
-    SampleT         *h_samples,
-    int             *h_histograms_linear,
-    int             num_samples)
-{
-    // Init bins
-    for (int bin = 0; bin < BINS; ++bin)
-    {
-        h_histograms_linear[bin] = 0;
-    }
-
-    if (g_verbose) printf("Samples: \n");
-
-    // Initialize interleaved channel samples and histogram them correspondingly
-    for (int i = 0; i < num_samples; ++i)
-    {
-        InitValue(gen_mode, h_samples[i], i);
-        h_samples[i] %= BINS;
-
-        if (g_verbose) std::cout << CoutCast(h_samples[i]) << ", ";
-
-        h_histograms_linear[h_samples[i]]++;
-    }
-
-    if (g_verbose) printf("\n\n");
-}
-
-
-/**
- * Test BlockHistogram
- */
-template <
-    typename                    SampleT,
-    int                         BINS,
-    int                         BLOCK_THREADS,
-    int                         ITEMS_PER_THREAD,
-    BlockHistogramAlgorithm     ALGORITHM>
-void Test(
-    GenMode                     gen_mode)
-{
-    int num_samples = BLOCK_THREADS * ITEMS_PER_THREAD;
-
-    printf("cub::BlockHistogram %s %d %s samples (%dB), %d bins, %d threads, gen-mode %s\n",
-        (ALGORITHM == BLOCK_HISTO_SORT) ? "BLOCK_HISTO_SORT" : "BLOCK_HISTO_ATOMIC",
-        num_samples,
-        typeid(SampleT).name(),
-        (int) sizeof(SampleT),
-        BINS,
-        BLOCK_THREADS,
-        (gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS");
-    fflush(stdout);
-
-    // Allocate host arrays
-    SampleT         *h_samples          = new SampleT[num_samples];
-    int   *h_reference = new int[BINS];
-
-    // Initialize problem
-    Initialize<BINS>(gen_mode, h_samples, h_reference, num_samples);
-
-    // Allocate problem device arrays
-    SampleT         *d_samples = NULL;
-    int             *d_histogram = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples,             sizeof(SampleT) * num_samples));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram,   sizeof(int) * BINS));
-
-    // Initialize/clear device arrays
-    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * num_samples, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemset(d_histogram, 0, sizeof(int) * BINS));
-
-    // Run kernel
-    BlockHistogramKernel<BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<1, BLOCK_THREADS>>>(
-        d_samples,
-        d_histogram);
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults((int*) h_reference, d_histogram, BINS, g_verbose, g_verbose);
-    printf("\t%s\n\n", compare ? "FAIL" : "PASS");
-
-    // Flush any stdout/stderr
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-    fflush(stdout);
-    fflush(stderr);
-
-    // Cleanup
-    if (h_samples) delete[] h_samples;
-    if (h_reference) delete[] h_reference;
-    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
-    if (d_histogram) CubDebugExit(g_allocator.DeviceFree(d_histogram));
-
-    // Correctness asserts
-    AssertEquals(0, compare);
-}
-
-
-/**
- * Test different sample distributions
- */
-template <
-    typename                    SampleT,
-    int                         BINS,
-    int                         BLOCK_THREADS,
-    int                         ITEMS_PER_THREAD,
-    BlockHistogramAlgorithm     ALGORITHM>
-void Test()
-{
-    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>(UNIFORM);
-    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>(INTEGER_SEED);
-    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>(RANDOM);
-}
-
-
-/**
- * Test different ALGORITHM
- */
-template <
-    typename                    SampleT,
-    int                         BINS,
-    int                         BLOCK_THREADS,
-    int                         ITEMS_PER_THREAD>
-void Test()
-{
-    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_HISTO_SORT>();
-    Test<SampleT, BINS, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_HISTO_ATOMIC>();
-}
-
-
-/**
- * Test different ITEMS_PER_THREAD
- */
-template <
-    typename                    SampleT,
-    int                         BINS,
-    int                         BLOCK_THREADS>
-void Test()
-{
-    Test<SampleT, BINS, BLOCK_THREADS, 1>();
-    Test<SampleT, BINS, BLOCK_THREADS, 5>();
-}
-
-
-/**
- * Test different BLOCK_THREADS
- */
-template <
-    typename                    SampleT,
-    int                         BINS>
-void Test()
-{
-    Test<SampleT, BINS, 32>();
-    Test<SampleT, BINS, 96>();
-    Test<SampleT, BINS, 128>();
-}
-
-
-
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("repeat", g_repeat);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<total input samples across all channels> "
-            "[--device=<device-id>] "
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-#ifdef QUICK_TEST
-
-    // Compile/run quick tests
-    Test<unsigned char, 256, 128, 4, BLOCK_HISTO_SORT>(RANDOM);
-    Test<unsigned char, 256, 128, 4, BLOCK_HISTO_ATOMIC>(RANDOM);
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        Test<unsigned char, 32>();
-        Test<unsigned char, 256>();
-        Test<unsigned short, 1024>();
-    }
-
-#endif
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/test/test_block_load_store.cu b/external/cub/test/test_block_load_store.cu
deleted file mode 100644
index ca6ef1310f7..00000000000
--- a/external/cub/test/test_block_load_store.cu
+++ /dev/null
@@ -1,549 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of BlockLoad and BlockStore utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <iterator>
-#include <stdio.h>
-
-#include <cub/block/block_load.cuh>
-#include <cub/block/block_store.cuh>
-#include <cub/iterator/cache_modified_input_iterator.cuh>
-#include <cub/iterator/cache_modified_output_iterator.cuh>
-#include <cub/iterator/discard_output_iterator.cuh>
-#include <cub/util_allocator.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;
-CachingDeviceAllocator  g_allocator(true);
-
-
-//---------------------------------------------------------------------
-// Test kernels
-//---------------------------------------------------------------------
-
-
-/**
- * Test load/store kernel.
- */
-template <
-    int                 BLOCK_THREADS,
-    int                 ITEMS_PER_THREAD,
-    BlockLoadAlgorithm  LOAD_ALGORITHM,
-    BlockStoreAlgorithm STORE_ALGORITHM,
-    typename            InputIteratorT,
-    typename            OutputIteratorT>
-__launch_bounds__ (BLOCK_THREADS, 1)
-__global__ void Kernel(
-    InputIteratorT    d_in,
-    OutputIteratorT    d_out_unguarded,
-    OutputIteratorT    d_out_guarded,
-    int               num_items)
-{
-    enum
-    {
-        TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD
-    };
-
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // Threadblock load/store abstraction types
-    typedef BlockLoad<InputT, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM> BlockLoad;
-    typedef BlockStore<OutputT, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM> BlockStore;
-
-    // Shared memory type for this thread block
-    union TempStorage
-    {
-        typename BlockLoad::TempStorage     load;
-        typename BlockStore::TempStorage    store;
-    };
-
-    // Allocate temp storage in shared memory
-    __shared__ TempStorage temp_storage;
-
-    // Threadblock work bounds
-    int block_offset = blockIdx.x * TILE_SIZE;
-    int guarded_elements = num_items - block_offset;
-
-    // Tile of items
-    OutputT data[ITEMS_PER_THREAD];
-
-    // Load data
-    BlockLoad(temp_storage.load).Load(d_in + block_offset, data);
-
-    __syncthreads();
-
-    // Store data
-    BlockStore(temp_storage.store).Store(d_out_unguarded + block_offset, data);
-
-    __syncthreads();
-
-    // reset data
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        data[ITEM] = OutputT();
-
-    __syncthreads();
-
-    // Load data
-    BlockLoad(temp_storage.load).Load(d_in + block_offset, data, guarded_elements);
-
-    __syncthreads();
-
-    // Store data
-    BlockStore(temp_storage.store).Store(d_out_guarded + block_offset, data, guarded_elements);
-}
-
-
-//---------------------------------------------------------------------
-// Host testing subroutines
-//---------------------------------------------------------------------
-
-
-/**
- * Test load/store variants
- */
-template <
-    typename            T,
-    int                 BLOCK_THREADS,
-    int                 ITEMS_PER_THREAD,
-    BlockLoadAlgorithm  LOAD_ALGORITHM,
-    BlockStoreAlgorithm STORE_ALGORITHM,
-    typename            InputIteratorT,
-    typename            OutputIteratorT>
-void TestKernel(
-    T                   *h_in,
-    InputIteratorT      d_in,
-    OutputIteratorT      d_out_unguarded_itr,
-    OutputIteratorT      d_out_guarded_itr,
-    T                   *d_out_unguarded_ptr,
-    T                   *d_out_guarded_ptr,
-    int                 grid_size,
-    int                 guarded_elements)
-{
-    int compare;
-
-    int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
-
-    // Test with discard output iterator
-    typedef typename std::iterator_traits<InputIteratorT>::difference_type OffsetT;
-    DiscardOutputIterator<OffsetT> discard_itr;
-
-    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
-        <<<grid_size, BLOCK_THREADS>>>(
-            d_in,
-            discard_itr,
-            discard_itr,
-            guarded_elements);
-
-    // Test with regular output iterator
-    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>
-        <<<grid_size, BLOCK_THREADS>>>(
-            d_in,
-            d_out_unguarded_itr,
-            d_out_guarded_itr,
-            guarded_elements);
-
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Check results
-    compare = CompareDeviceResults(h_in, d_out_guarded_ptr, guarded_elements, g_verbose, g_verbose);
-    printf("\tGuarded: %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Check results
-    compare = CompareDeviceResults(h_in, d_out_unguarded_ptr, unguarded_elements, g_verbose, g_verbose);
-    printf("\tUnguarded: %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-}
-
-
-/**
- * Test native pointer.  Specialized for sufficient resources
- */
-template <
-    typename            T,
-    int                 BLOCK_THREADS,
-    int                 ITEMS_PER_THREAD,
-    BlockLoadAlgorithm  LOAD_ALGORITHM,
-    BlockStoreAlgorithm STORE_ALGORITHM>
-void TestNative(
-    int                 grid_size,
-    float               fraction_valid,
-    Int2Type<true>      sufficient_resources)
-{
-    int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
-    int guarded_elements = int(fraction_valid * float(unguarded_elements));
-
-    // Allocate host arrays
-    T *h_in = (T*) malloc(unguarded_elements * sizeof(T));
-
-    // Allocate device arrays
-    T *d_in = NULL;
-    T *d_out_unguarded = NULL;
-    T *d_out_guarded = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements));
-    CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements));
-    CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements));
-
-    // Initialize problem on host and device
-    for (int i = 0; i < unguarded_elements; ++i)
-    {
-        InitValue(INTEGER_SEED, h_in[i], i);
-    }
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice));
-
-    printf("TestNative "
-        "grid_size(%d) "
-        "guarded_elements(%d) "
-        "unguarded_elements(%d) "
-        "BLOCK_THREADS(%d) "
-        "ITEMS_PER_THREAD(%d) "
-        "LOAD_ALGORITHM(%d) "
-        "STORE_ALGORITHM(%d) "
-        "sizeof(T)(%d)\n",
-            grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, (int) sizeof(T));
-
-    TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
-        h_in,
-        (T const *) d_in,   // Test const
-        d_out_unguarded,
-        d_out_guarded,
-        d_out_unguarded,
-        d_out_guarded,
-        grid_size,
-        guarded_elements);
-
-    // Cleanup
-    if (h_in) free(h_in);
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
-    if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
-}
-
-
-/**
- * Test native pointer.  Specialized for insufficient resources
- */
-template <
-    typename            T,
-    int                 BLOCK_THREADS,
-    int                 ITEMS_PER_THREAD,
-    BlockLoadAlgorithm  LOAD_ALGORITHM,
-    BlockStoreAlgorithm STORE_ALGORITHM>
-void TestNative(
-    int                 grid_size,
-    float               fraction_valid,
-    Int2Type<false>      sufficient_resources)
-{}
-
-
-/**
- * Test iterator.  Specialized for sufficient resources.
- */
-template <
-    typename            T,
-    int                 BLOCK_THREADS,
-    int                 ITEMS_PER_THREAD,
-    BlockLoadAlgorithm  LOAD_ALGORITHM,
-    BlockStoreAlgorithm STORE_ALGORITHM,
-    CacheLoadModifier   LOAD_MODIFIER,
-    CacheStoreModifier  STORE_MODIFIER>
-void TestIterator(
-    int                 grid_size,
-    float               fraction_valid,
-    Int2Type<true>      sufficient_resources)
-{
-    int unguarded_elements = grid_size * BLOCK_THREADS * ITEMS_PER_THREAD;
-    int guarded_elements = int(fraction_valid * float(unguarded_elements));
-
-    // Allocate host arrays
-    T *h_in = (T*) malloc(unguarded_elements * sizeof(T));
-
-    // Allocate device arrays
-    T *d_in = NULL;
-    T *d_out_unguarded = NULL;
-    T *d_out_guarded = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * unguarded_elements));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_unguarded, sizeof(T) * unguarded_elements));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out_guarded, sizeof(T) * guarded_elements));
-    CubDebugExit(cudaMemset(d_out_unguarded, 0, sizeof(T) * unguarded_elements));
-    CubDebugExit(cudaMemset(d_out_guarded, 0, sizeof(T) * guarded_elements));
-
-    // Initialize problem on host and device
-    for (int i = 0; i < unguarded_elements; ++i)
-    {
-        InitValue(INTEGER_SEED, h_in[i], i);
-    }
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * unguarded_elements, cudaMemcpyHostToDevice));
-
-    printf("TestIterator "
-        "grid_size(%d) "
-        "guarded_elements(%d) "
-        "unguarded_elements(%d) "
-        "BLOCK_THREADS(%d) "
-        "ITEMS_PER_THREAD(%d) "
-        "LOAD_ALGORITHM(%d) "
-        "STORE_ALGORITHM(%d) "
-        "LOAD_MODIFIER(%d) "
-        "STORE_MODIFIER(%d) "
-        "sizeof(T)(%d)\n",
-            grid_size, guarded_elements, unguarded_elements, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_MODIFIER, STORE_MODIFIER, (int) sizeof(T));
-
-    TestKernel<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(
-        h_in,
-        CacheModifiedInputIterator<LOAD_MODIFIER, T>(d_in),
-        CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_unguarded),
-        CacheModifiedOutputIterator<STORE_MODIFIER, T>(d_out_guarded),
-        d_out_unguarded,
-        d_out_guarded,
-        grid_size,
-        guarded_elements);
-
-    // Cleanup
-    if (h_in) free(h_in);
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out_unguarded) CubDebugExit(g_allocator.DeviceFree(d_out_unguarded));
-    if (d_out_guarded) CubDebugExit(g_allocator.DeviceFree(d_out_guarded));
-}
-
-/**
- * Test iterator.  Specialized for insufficient resources.
- */
-template <
-    typename            T,
-    int                 BLOCK_THREADS,
-    int                 ITEMS_PER_THREAD,
-    BlockLoadAlgorithm  LOAD_ALGORITHM,
-    BlockStoreAlgorithm STORE_ALGORITHM,
-    CacheLoadModifier   LOAD_MODIFIER,
-    CacheStoreModifier  STORE_MODIFIER>
-void TestIterator(
-    int                 grid_size,
-    float               fraction_valid,
-    Int2Type<false>     sufficient_resources)
-{}
-
-
-/**
- * Evaluate different pointer access types
- */
-template <
-    typename                T,
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    BlockLoadAlgorithm      LOAD_ALGORITHM,
-    BlockStoreAlgorithm     STORE_ALGORITHM>
-void TestPointerType(
-    int             grid_size,
-    float           fraction_valid)
-{
-    // Threadblock load/store abstraction types
-    typedef BlockLoad<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM> BlockLoad;
-    typedef BlockStore<T, BLOCK_THREADS, ITEMS_PER_THREAD, STORE_ALGORITHM> BlockStore;
-
-#if defined(SM100) || defined(SM110) || defined(SM130)
-    static const bool sufficient_load_smem  = sizeof(typename BlockLoad::TempStorage)   <= 1024 * 16;
-    static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage)  <= 1024 * 16;
-    static const bool sufficient_threads    = BLOCK_THREADS <= 512;
-#else
-    static const bool sufficient_load_smem  = sizeof(typename BlockLoad::TempStorage)   <= 1024 * 48;
-    static const bool sufficient_store_smem = sizeof(typename BlockStore::TempStorage)  <= 1024 * 48;
-    static const bool sufficient_threads    = BLOCK_THREADS <= 1024;
-#endif
-
-    static const bool sufficient_resources  = sufficient_load_smem && sufficient_store_smem && sufficient_threads;
-
-    TestNative<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM>(grid_size, fraction_valid, Int2Type<sufficient_resources>());
-    TestIterator<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, LOAD_DEFAULT, STORE_DEFAULT>(grid_size, fraction_valid, Int2Type<sufficient_resources>());
-}
-
-
-/**
- * Evaluate different time-slicing strategies
- */
-template <
-    typename                T,
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    BlockLoadAlgorithm      LOAD_ALGORITHM,
-    BlockStoreAlgorithm     STORE_ALGORITHM>
-void TestSlicedStrategy(
-    int             grid_size,
-    float           fraction_valid)
-{
-    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, true>(grid_size, fraction_valid);
-    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, LOAD_ALGORITHM, STORE_ALGORITHM, false>(grid_size, fraction_valid);
-}
-
-
-
-/**
- * Evaluate different load/store strategies (specialized for block sizes that are not a multiple of 32)
- */
-template <
-    typename        T,
-    int             BLOCK_THREADS,
-    int             ITEMS_PER_THREAD>
-void TestStrategy(
-    int             grid_size,
-    float           fraction_valid,
-    Int2Type<false> is_warp_multiple)
-{
-    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, BLOCK_STORE_DIRECT>(grid_size, fraction_valid);
-    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, BLOCK_STORE_TRANSPOSE>(grid_size, fraction_valid);
-    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_VECTORIZE, BLOCK_STORE_VECTORIZE>(grid_size, fraction_valid);
-}
-
-
-/**
- * Evaluate different load/store strategies (specialized for block sizes that are a multiple of 32)
- */
-template <
-    typename        T,
-    int             BLOCK_THREADS,
-    int             ITEMS_PER_THREAD>
-void TestStrategy(
-    int             grid_size,
-    float           fraction_valid,
-    Int2Type<true>  is_warp_multiple)
-{
-    TestStrategy<T, BLOCK_THREADS, ITEMS_PER_THREAD>(grid_size, fraction_valid, Int2Type<false>());
-    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(grid_size, fraction_valid);
-    TestPointerType<T, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED>(grid_size, fraction_valid);
-}
-
-
-/**
- * Evaluate different register blocking
- */
-template <
-    typename T,
-    int BLOCK_THREADS>
-void TestItemsPerThread(
-    int grid_size,
-    float fraction_valid)
-{
-    Int2Type<BLOCK_THREADS % 32 == 0> is_warp_multiple;
-
-    TestStrategy<T, BLOCK_THREADS, 1>(grid_size, fraction_valid, is_warp_multiple);
-    TestStrategy<T, BLOCK_THREADS, 3>(grid_size, fraction_valid, is_warp_multiple);
-    TestStrategy<T, BLOCK_THREADS, 4>(grid_size, fraction_valid, is_warp_multiple);
-    TestStrategy<T, BLOCK_THREADS, 11>(grid_size, fraction_valid, is_warp_multiple);
-}
-
-
-/**
- * Evaluate different thread block sizes
- */
-template <typename T>
-void TestThreads(
-    int grid_size,
-    float fraction_valid)
-{
-    TestItemsPerThread<T, 15>(grid_size, fraction_valid);
-    TestItemsPerThread<T, 32>(grid_size, fraction_valid);
-    TestItemsPerThread<T, 72>(grid_size, fraction_valid);
-    TestItemsPerThread<T, 96>(grid_size, fraction_valid);
-    TestItemsPerThread<T, 128>(grid_size, fraction_valid);
-}
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Get ptx version
-    int ptx_version;
-    CubDebugExit(PtxVersion(ptx_version));
-
-#ifdef QUICK_TEST
-
-    // Compile/run quick tests
-    TestNative<     int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE>(1, 0.8f, Int2Type<true>());
-    TestIterator<   int, 64, 2, BLOCK_LOAD_WARP_TRANSPOSE, BLOCK_STORE_WARP_TRANSPOSE, LOAD_DEFAULT, STORE_DEFAULT>(1, 0.8f, Int2Type<true>());
-
-#else
-
-    // Compile/run thorough tests
-    TestThreads<char>(2, 0.8f);
-    TestThreads<int>(2, 0.8f);
-    TestThreads<long>(2, 0.8f);
-    TestThreads<long2>(2, 0.8f);
-
-    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
-        TestThreads<double2>(2, 0.8f);
-    TestThreads<TestFoo>(2, 0.8f);
-    TestThreads<TestBar>(2, 0.8f);
-
-#endif
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/test/test_block_radix_sort.cu b/external/cub/test/test_block_radix_sort.cu
deleted file mode 100644
index b3418dae6b2..00000000000
--- a/external/cub/test/test_block_radix_sort.cu
+++ /dev/null
@@ -1,717 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of BlockRadixSort utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <algorithm>
-#include <iostream>
-
-#include <cub/block/block_radix_sort.cuh>
-#include <cub/block/block_load.cuh>
-#include <cub/block/block_store.cuh>
-#include <cub/util_allocator.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;
-CachingDeviceAllocator  g_allocator(true);
-
-
-//---------------------------------------------------------------------
-// Test kernels
-//---------------------------------------------------------------------
-
-
-/// Specialized descending, blocked -> blocked
-template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
-__device__ __forceinline__ void TestBlockSort(
-    typename BlockRadixSort::TempStorage &temp_storage,
-    Key                         (&keys)[ITEMS_PER_THREAD],
-    Value                       (&values)[ITEMS_PER_THREAD],
-    Key                         *d_keys,
-    Value                       *d_values,
-    int                         begin_bit,
-    int                         end_bit,
-    clock_t                     &stop,
-    Int2Type<true>              is_descending,
-    Int2Type<true>              is_blocked_output)
-{
-    BlockRadixSort(temp_storage).SortDescending(keys, values, begin_bit, end_bit);
-    stop = clock();
-    StoreDirectBlocked(threadIdx.x, d_keys, keys);
-    StoreDirectBlocked(threadIdx.x, d_values, values);
-}
-
-/// Specialized descending, blocked -> striped
-template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
-__device__ __forceinline__ void TestBlockSort(
-    typename BlockRadixSort::TempStorage &temp_storage,
-    Key                         (&keys)[ITEMS_PER_THREAD],
-    Value                       (&values)[ITEMS_PER_THREAD],
-    Key                         *d_keys,
-    Value                       *d_values,
-    int                         begin_bit,
-    int                         end_bit,
-    clock_t                     &stop,
-    Int2Type<true>              is_descending,
-    Int2Type<false>             is_blocked_output)
-{
-    BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(keys, values, begin_bit, end_bit);
-    stop = clock();
-    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
-    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
-}
-
-/// Specialized ascending, blocked -> blocked
-template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
-__device__ __forceinline__ void TestBlockSort(
-    typename BlockRadixSort::TempStorage &temp_storage,
-    Key                         (&keys)[ITEMS_PER_THREAD],
-    Value                       (&values)[ITEMS_PER_THREAD],
-    Key                         *d_keys,
-    Value                       *d_values,
-    int                         begin_bit,
-    int                         end_bit,
-    clock_t                     &stop,
-    Int2Type<false>             is_descending,
-    Int2Type<true>              is_blocked_output)
-{
-    BlockRadixSort(temp_storage).Sort(keys, values, begin_bit, end_bit);
-    stop = clock();
-    StoreDirectBlocked(threadIdx.x, d_keys, keys);
-    StoreDirectBlocked(threadIdx.x, d_values, values);
-}
-
-/// Specialized ascending, blocked -> striped
-template <int BLOCK_THREADS, typename BlockRadixSort, int ITEMS_PER_THREAD, typename Key, typename Value>
-__device__ __forceinline__ void TestBlockSort(
-    typename BlockRadixSort::TempStorage &temp_storage,
-    Key                         (&keys)[ITEMS_PER_THREAD],
-    Value                       (&values)[ITEMS_PER_THREAD],
-    Key                         *d_keys,
-    Value                       *d_values,
-    int                         begin_bit,
-    int                         end_bit,
-    clock_t                     &stop,
-    Int2Type<false>             is_descending,
-    Int2Type<false>             is_blocked_output)
-{
-    BlockRadixSort(temp_storage).SortBlockedToStriped(keys, values, begin_bit, end_bit);
-    stop = clock();
-    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys, keys);
-    StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values, values);
-}
-
-
-
-/**
- * BlockRadixSort kernel
- */
-template <
-    int                 BLOCK_THREADS,
-    int                 ITEMS_PER_THREAD,
-    int                 RADIX_BITS,
-    bool                MEMOIZE_OUTER_SCAN,
-    BlockScanAlgorithm  INNER_SCAN_ALGORITHM,
-    cudaSharedMemConfig SMEM_CONFIG,
-    int                 DESCENDING,
-    int                 BLOCKED_OUTPUT,
-    typename            Key,
-    typename            Value>
-__launch_bounds__ (BLOCK_THREADS, 1)
-__global__ void Kernel(
-    Key                         *d_keys,
-    Value                       *d_values,
-    int                         begin_bit,
-    int                         end_bit,
-    clock_t                     *d_elapsed)
-{
-    // Threadblock load/store abstraction types
-    typedef BlockRadixSort<
-            Key,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            Value,
-            RADIX_BITS,
-            MEMOIZE_OUTER_SCAN,
-            INNER_SCAN_ALGORITHM,
-            SMEM_CONFIG>
-        BlockRadixSortT;
-
-    // Allocate temp storage in shared memory
-    __shared__ typename BlockRadixSortT::TempStorage temp_storage;
-
-    // Items per thread
-    Key     keys[ITEMS_PER_THREAD];
-    Value   values[ITEMS_PER_THREAD];
-
-    LoadDirectBlocked(threadIdx.x, d_keys, keys);
-    LoadDirectBlocked(threadIdx.x, d_values, values);
-
-    // Start cycle timer
-    clock_t stop;
-    clock_t start = clock();
-
-    TestBlockSort<BLOCK_THREADS, BlockRadixSortT>(
-        temp_storage, keys, values, d_keys, d_values, begin_bit, end_bit, stop, Int2Type<DESCENDING>(), Int2Type<BLOCKED_OUTPUT>());
-
-    // Store time
-    if (threadIdx.x == 0)
-        *d_elapsed = (start > stop) ? start - stop : stop - start;
-}
-
-
-
-//---------------------------------------------------------------------
-// Host testing subroutines
-//---------------------------------------------------------------------
-
-
-/**
- * Simple key-value pairing
- */
-template <
-    typename Key,
-    typename Value,
-    bool IS_FLOAT = (Traits<Key>::CATEGORY == FLOATING_POINT)>
-struct Pair
-{
-    Key     key;
-    Value   value;
-
-    bool operator<(const Pair &b) const
-    {
-        return (key < b.key);
-    }
-};
-
-/**
- * Simple key-value pairing (specialized for floating point types)
- */
-template <typename Key, typename Value>
-struct Pair<Key, Value, true>
-{
-    Key     key;
-    Value   value;
-
-    bool operator<(const Pair &b) const
-    {
-        if (key < b.key)
-            return true;
-
-        if (key > b.key)
-            return false;
-
-        // Key in unsigned bits
-        typedef typename Traits<Key>::UnsignedBits UnsignedBits;
-
-        // Return true if key is negative zero and b.key is positive zero
-        UnsignedBits key_bits   = *reinterpret_cast<UnsignedBits*>(const_cast<Key*>(&key));
-        UnsignedBits b_key_bits = *reinterpret_cast<UnsignedBits*>(const_cast<Key*>(&b.key));
-        UnsignedBits HIGH_BIT   = Traits<Key>::HIGH_BIT;
-
-        return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
-    }
-};
-
-
-/**
- * Initialize key-value sorting problem.
- */
-template <bool DESCENDING, typename Key, typename Value>
-void Initialize(
-    GenMode         gen_mode,
-    Key             *h_keys,
-    Value           *h_values,
-    Key             *h_reference_keys,
-    Value           *h_reference_values,
-    int             num_items,
-    int             entropy_reduction,
-    int             begin_bit,
-    int             end_bit)
-{
-    Pair<Key, Value> *h_pairs = new Pair<Key, Value>[num_items];
-
-    for (int i = 0; i < num_items; ++i)
-    {
-        InitValue(gen_mode, h_keys[i], i);
-
-        RandomBits(h_values[i]);
-
-        // Mask off unwanted portions
-        int num_bits = end_bit - begin_bit;
-        if ((begin_bit > 0) || (end_bit < sizeof(Key) * 8))
-        {
-            unsigned long long base = 0;
-            memcpy(&base, &h_keys[i], sizeof(Key));
-            base &= ((1ull << num_bits) - 1) << begin_bit;
-            memcpy(&h_keys[i], &base, sizeof(Key));
-        }
-
-        h_pairs[i].key    = h_keys[i];
-        h_pairs[i].value  = h_values[i];
-    }
-
-    if (DESCENDING) std::reverse(h_pairs, h_pairs + num_items);
-    std::stable_sort(h_pairs, h_pairs + num_items);
-    if (DESCENDING) std::reverse(h_pairs, h_pairs + num_items);
-
-    for (int i = 0; i < num_items; ++i)
-    {
-        h_reference_keys[i]     = h_pairs[i].key;
-        h_reference_values[i]   = h_pairs[i].value;
-    }
-
-    delete[] h_pairs;
-}
-
-
-
-
-/**
- * Test BlockRadixSort kernel
- *
- * Builds one tile (BLOCK_THREADS * ITEMS_PER_THREAD items) of keys/values on
- * the host, computes the expected sorted result, launches Kernel with a
- * single thread block, and compares the device output against the reference.
- * A mismatch is reported as FAIL and passed to AssertEquals.
- *
- * gen_mode / entropy_reduction are forwarded to Initialize; only key bits in
- * [begin_bit, end_bit) take part in the sort.
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    int                     RADIX_BITS,
-    bool                    MEMOIZE_OUTER_SCAN,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
-    cudaSharedMemConfig     SMEM_CONFIG,
-    bool                    DESCENDING,
-    bool                    BLOCKED_OUTPUT,
-    typename                Key,
-    typename                Value>
-void TestDriver(
-    GenMode                 gen_mode,
-    int                     entropy_reduction,
-    int                     begin_bit,
-    int                     end_bit)
-{
-    enum
-    {
-        TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD,
-        // Keys-only mode is selected by using NullType as the value type
-        KEYS_ONLY = Equals<Value, NullType>::VALUE,
-    };
-
-    // Allocate host arrays
-    Key     *h_keys             = new Key[TILE_SIZE];
-    Key     *h_reference_keys   = new Key[TILE_SIZE];
-    Value   *h_values           = new Value[TILE_SIZE];
-    Value   *h_reference_values = new Value[TILE_SIZE];
-
-    // Allocate device arrays
-    Key     *d_keys     = NULL;
-    Value   *d_values   = NULL;
-    clock_t *d_elapsed  = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys, sizeof(Key) * TILE_SIZE));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values, sizeof(Value) * TILE_SIZE));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
-
-    // Initialize problem and solution on host
-    Initialize<DESCENDING>(gen_mode, h_keys, h_values, h_reference_keys, h_reference_values,
-        TILE_SIZE, entropy_reduction, begin_bit, end_bit);
-
-    // Copy problem to device
-    CubDebugExit(cudaMemcpy(d_keys, h_keys, sizeof(Key) * TILE_SIZE, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(d_values, h_values, sizeof(Value) * TILE_SIZE, cudaMemcpyHostToDevice));
-
-    // Describe the configuration under test
-    printf("%s "
-        "BLOCK_THREADS(%d) "
-        "ITEMS_PER_THREAD(%d) "
-        "RADIX_BITS(%d) "
-        "MEMOIZE_OUTER_SCAN(%d) "
-        "INNER_SCAN_ALGORITHM(%d) "
-        "SMEM_CONFIG(%d) "
-        "DESCENDING(%d) "
-        "BLOCKED_OUTPUT(%d) "
-        "sizeof(Key)(%d) "
-        "sizeof(Value)(%d) "
-        "gen_mode(%d), "
-        "entropy_reduction(%d) "
-        "begin_bit(%d) "
-        "end_bit(%d), "
-        "samples(%d)\n",
-            ((KEYS_ONLY) ? "Keys-only" : "Key-value"),
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            RADIX_BITS,
-            MEMOIZE_OUTER_SCAN,
-            INNER_SCAN_ALGORITHM,
-            SMEM_CONFIG,
-            DESCENDING,
-            BLOCKED_OUTPUT,
-            (int) sizeof(Key),
-            (int) sizeof(Value),
-            gen_mode,
-            entropy_reduction,
-            begin_bit,
-            end_bit,
-            g_num_rand_samples);
-
-    // Set shared memory config
-    // NOTE(review): the return code of this call is not checked, unlike the
-    // other runtime calls in this function
-    cudaDeviceSetSharedMemConfig(SMEM_CONFIG);
-
-    // Run kernel (a single thread block of BLOCK_THREADS threads)
-    Kernel<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT><<<1, BLOCK_THREADS>>>(
-        d_keys, d_values, begin_bit, end_bit, d_elapsed);
-
-    // Flush kernel output / errors
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Check keys results (a nonzero compare result is reported as FAIL)
-    printf("\tKeys: ");
-    int compare = CompareDeviceResults(h_reference_keys, d_keys, TILE_SIZE, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Check value results
-    if (!KEYS_ONLY)
-    {
-        printf("\tValues: ");
-        // This inner 'compare' shadows the keys result declared above
-        int compare = CompareDeviceResults(h_reference_values, d_values, TILE_SIZE, g_verbose, g_verbose);
-        printf("%s\n", compare ? "FAIL" : "PASS");
-        AssertEquals(0, compare);
-    }
-    printf("\n");
-
-    // Report the cycle count the kernel wrote to d_elapsed
-    printf("\tElapsed clocks: ");
-    DisplayDeviceResults(d_elapsed, 1);
-    printf("\n");
-
-    // Cleanup
-    if (h_keys)             delete[] h_keys;
-    if (h_reference_keys)   delete[] h_reference_keys;
-    if (h_values)           delete[] h_values;
-    if (h_reference_values) delete[] h_reference_values;
-    if (d_keys)             CubDebugExit(g_allocator.DeviceFree(d_keys));
-    if (d_values)           CubDebugExit(g_allocator.DeviceFree(d_values));
-    if (d_elapsed)          CubDebugExit(g_allocator.DeviceFree(d_elapsed));
-}
-
-
-/**
- * Test driver for configurations whose temp storage fits in shared memory.
- * Sweeps begin_bit, end_bit, and the key generation mode.
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    int                     RADIX_BITS,
-    bool                    MEMOIZE_OUTER_SCAN,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
-    cudaSharedMemConfig     SMEM_CONFIG,
-    bool                    DESCENDING,
-    bool                    BLOCKED_OUTPUT,
-    typename                Key,
-    typename                Value>
-void TestValid(Int2Type<true> fits_smem_capacity)
-{
-    // Total number of bits in a key
-    const int KEY_BITS = (int) (sizeof(Key) * 8);
-
-    // begin_bit takes the values 0 and 1
-    for (int begin_bit = 0; begin_bit < 2; ++begin_bit)
-    {
-        // end_bit starts one past begin_bit and roughly doubles each step
-        int end_bit = begin_bit + 1;
-        while (end_bit <= KEY_BITS)
-        {
-            // Uniform key distribution
-            TestDriver<
-                BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-                SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(UNIFORM, 0, begin_bit, end_bit);
-
-            // Sequential key distribution
-            TestDriver<
-                BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-                SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(INTEGER_SEED, 0, begin_bit, end_bit);
-
-            // Random keys with entropy_reduction of 0, 3, 6 and 9
-            int entropy_reduction = 0;
-            while (entropy_reduction <= 9)
-            {
-                TestDriver<
-                    BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-                    SMEM_CONFIG, DESCENDING, BLOCKED_OUTPUT, Key, Value>(RANDOM, entropy_reduction, begin_bit, end_bit);
-
-                entropy_reduction += 3;
-            }
-
-            end_bit = end_bit * 2 + begin_bit;
-        }
-    }
-}
-
-
-/**
- * Test driver for configurations whose temp storage does not fit in shared
- * memory: nothing is instantiated or run.
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    int                     RADIX_BITS,
-    bool                    MEMOIZE_OUTER_SCAN,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
-    cudaSharedMemConfig     SMEM_CONFIG,
-    bool                    DESCENDING,
-    bool                    BLOCKED_OUTPUT,
-    typename                Key,
-    typename                Value>
-void TestValid(Int2Type<false> fits_smem_capacity)
-{
-    // Intentionally empty
-}
-
-
-/**
- * Test sort direction and output arrangement.  Dispatches to TestValid with a
- * compile-time tag that says whether the sort's temp storage fits in shared
- * memory.
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    int                     RADIX_BITS,
-    bool                    MEMOIZE_OUTER_SCAN,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
-    cudaSharedMemConfig     SMEM_CONFIG,
-    typename                Key,
-    typename                Value>
-void Test()
-{
-    // The sort type whose TempStorage footprint is being checked
-    typedef BlockRadixSort<Key, BLOCK_THREADS, ITEMS_PER_THREAD, Value, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG> BlockRadixSortT;
-
-    enum
-    {
-        // Shared memory budget assumed for the target arch
-#if defined(SM100) || defined(SM110) || defined(SM130)
-        SMEM_BYTES_LIMIT    = 16 * 1024,
-#else
-        SMEM_BYTES_LIMIT    = 48 * 1024,
-#endif
-        FITS_SMEM           = (sizeof(typename BlockRadixSortT::TempStorage) <= SMEM_BYTES_LIMIT),
-    };
-
-    Int2Type<FITS_SMEM> fits_smem_capacity;
-
-    // DESCENDING = true, BLOCKED_OUTPUT = false
-    TestValid<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        SMEM_CONFIG, true, false, Key, Value>(fits_smem_capacity);
-
-    // DESCENDING = false, BLOCKED_OUTPUT = true
-    TestValid<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        SMEM_CONFIG, false, true, Key, Value>(fits_smem_capacity);
-
-    // The remaining two combinations are deliberately left disabled (not necessary):
-//    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, false, false, Key, Value>(fits_smem_capacity);
-//    TestValid<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM, SMEM_CONFIG, true, true, Key, Value>(fits_smem_capacity);
-}
-
-
-/**
- * Test keys-only sorting (Value = NullType) under each shared memory
- * bank-size configuration
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    int                     RADIX_BITS,
-    bool                    MEMOIZE_OUTER_SCAN,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
-    typename                Key>
-void TestKeys()
-{
-    // Keys-only (4-byte smem bank config)
-    Test<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        cudaSharedMemBankSizeFourByte, Key, NullType>();
-
-#if !(defined(SM100) || defined(SM110) || defined(SM130) || defined(SM200))
-    // Keys-only (8-byte smem bank config); compiled out for SM100/SM110/SM130/SM200 builds
-    Test<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        cudaSharedMemBankSizeEightByte, Key, NullType>();
-#endif
-}
-
-
-/**
- * Test key-value sorting with several value types.  Only the 4-byte shared
- * memory bank configuration is exercised.
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    int                     RADIX_BITS,
-    bool                    MEMOIZE_OUTER_SCAN,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM,
-    typename                Key>
-void TestKeysAndPairs()
-{
-    // Small values (char)
-    Test<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        cudaSharedMemBankSizeFourByte, Key, char>();
-
-    // Values of the same type as the keys
-    Test<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        cudaSharedMemBankSizeFourByte, Key, Key>();
-
-    // Large values (TestFoo)
-    Test<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        cudaSharedMemBankSizeFourByte, Key, TestFoo>();
-}
-
-
-/**
- * Test key type.  With TEST_KEYS_ONLY defined, the unsigned integer key types
- * are sorted keys-only; otherwise the signed integer and floating point key
- * types are sorted with paired values.
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    int                     RADIX_BITS,
-    bool                    MEMOIZE_OUTER_SCAN,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM>
-void Test()
-{
-    // Query the PTX version (only consulted by the paired-value branch below)
-    int ptx_version;
-    CubDebugExit(PtxVersion(ptx_version));
-
-#ifdef TEST_KEYS_ONLY
-
-    // Unsigned key types, keys-only
-    TestKeys<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        unsigned char>();
-    TestKeys<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        unsigned short>();
-    TestKeys<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        unsigned int>();
-    TestKeys<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        unsigned long>();
-    TestKeys<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        unsigned long long>();
-
-#else
-
-    // Signed integer and floating point key types, with paired values
-    TestKeysAndPairs<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        char>();
-    TestKeysAndPairs<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        short>();
-    TestKeysAndPairs<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        int>();
-    TestKeysAndPairs<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        long>();
-    TestKeysAndPairs<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        long long>();
-    TestKeysAndPairs<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-        float>();
-
-    // Don't check doubles on PTX120 or below because they're down-converted
-    const bool test_doubles = (ptx_version > 120);
-    if (test_doubles)
-    {
-        TestKeysAndPairs<
-            BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN, INNER_SCAN_ALGORITHM,
-            double>();
-    }
-
-#endif
-}
-
-
-/**
- * Test inner scan algorithm: runs the key-type tests once per BlockScan
- * algorithm listed below
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    int                     RADIX_BITS,
-    bool                    MEMOIZE_OUTER_SCAN>
-void Test()
-{
-    // Raking inner scan
-    Test<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN,
-        BLOCK_SCAN_RAKING>();
-
-    // Warp-scans inner scan
-    Test<
-        BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, MEMOIZE_OUTER_SCAN,
-        BLOCK_SCAN_WARP_SCANS>();
-}
-
-
-/**
- * Test outer scan algorithm: runs the inner-scan tests with
- * MEMOIZE_OUTER_SCAN enabled, then disabled
- */
-template <
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    int                     RADIX_BITS>
-void Test()
-{
-    // MEMOIZE_OUTER_SCAN = true
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, true>();
-
-    // MEMOIZE_OUTER_SCAN = false
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, RADIX_BITS, false>();
-}
-
-
-/**
- * Test radix bits: runs the outer-scan tests for RADIX_BITS of 1, 2 and 5
- */
-template <
-    int BLOCK_THREADS,
-    int ITEMS_PER_THREAD>
-void Test()
-{
-    // RADIX_BITS = 1
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, 1>();
-
-    // RADIX_BITS = 2
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, 2>();
-
-    // RADIX_BITS = 5
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, 5>();
-}
-
-
-/**
- * Test items per thread: runs the radix-bits tests for 1, 4 and 11 items per
- * thread (the 4-item case is compiled out for SM100/SM110/SM130 builds)
- */
-template <int BLOCK_THREADS>
-void Test()
-{
-    // ITEMS_PER_THREAD = 1
-    Test<BLOCK_THREADS, 1>();
-
-#if !(defined(SM100) || defined(SM110) || defined(SM130))
-    // ITEMS_PER_THREAD = 4
-    // (skipped on the older targets: Open64 compiler can't handle the number of test cases)
-    Test<BLOCK_THREADS, 4>();
-#endif
-
-    // ITEMS_PER_THREAD = 11
-    Test<BLOCK_THREADS, 11>();
-}
-
-
-
-/**
- * Main: parses the command line, initializes the device, and runs either a
- * single quick test (QUICK_TEST defined) or the full test battery
- */
-int main(int argc, char** argv)
-{
-    // Parse command line
-    CommandLineArgs cmd_line(argc, argv);
-    g_verbose = cmd_line.CheckCmdLineFlag("v");
-
-    // Usage request: print the synopsis and stop
-    const bool show_usage = cmd_line.CheckCmdLineFlag("help");
-    if (show_usage)
-    {
-        printf("%s [--device=<device-id>] [--v] \n", argv[0]);
-        exit(0);
-    }
-
-    // Select / initialize the device
-    CubDebugExit(cmd_line.DeviceInit());
-
-#ifdef QUICK_TEST
-
-    // Single keys-only float sort
-    {
-        typedef float KeyT;
-        TestDriver<
-            32, 4, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte,
-            false, false, KeyT, NullType>(INTEGER_SEED, 0, 0, sizeof(KeyT) * 8);
-    }
-
-    // Further quick tests, currently disabled:
-    //
-    //    typedef unsigned int T;
-    //    TestDriver<64, 17, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(RANDOM, 0, 0, sizeof(T) * 8);
-    //    TestDriver<96, 8, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(RANDOM, 0, 0, sizeof(T) * 8);
-    //    TestDriver<128, 2, 4, true, BLOCK_SCAN_WARP_SCANS, cudaSharedMemBankSizeFourByte, false, false, T, NullType>(RANDOM, 0, 0, sizeof(T) * 8);
-
-#else
-
-    // Thorough tests for thread block sizes of 32, 64 and 160
-    Test<32>();
-    Test<64>();
-    Test<160>();
-
-#endif  // QUICK_TEST
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/test/test_block_reduce.cu b/external/cub/test/test_block_reduce.cu
deleted file mode 100644
index 23261582c16..00000000000
--- a/external/cub/test/test_block_reduce.cu
+++ /dev/null
@@ -1,822 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of BlockReduce utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-
-#include <device_functions.h>
-#include <typeinfo>
-
-#include <cub/block/block_reduce.cuh>
-#include <cub/block/block_load.cuh>
-#include <cub/util_ptx.cuh>
-#include <cub/util_allocator.cuh>
-#include <cub/util_debug.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-// Passed to CompareDeviceResults and checked before dumping test inputs
-bool                    g_verbose       = false;
-// NOTE(review): not referenced in the visible part of this file; presumably a repeat count
-int                     g_repeat        = 0;
-// Allocator used for every device buffer in these tests
-// NOTE(review): meaning of the 'true' constructor argument is not visible here; confirm against CachingDeviceAllocator
-CachingDeviceAllocator  g_allocator(true);
-
-
-
-//---------------------------------------------------------------------
-// Test kernels
-//---------------------------------------------------------------------
-
-
-/// Generic reduction over a full tile, one item per thread
-template <typename BlockReduceT, typename T, typename ReductionOp>
-__device__ __forceinline__ T DeviceTest(
-    BlockReduceT    &block_reduce,
-    T               (&data)[1],
-    ReductionOp     &reduction_op)
-{
-    // Hand the lone item to the single-item Reduce overload
-    T aggregate = block_reduce.Reduce(data[0], reduction_op);
-    return aggregate;
-}
-
-/// Generic reduction over a full tile, ITEMS_PER_THREAD items per thread
-template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD, typename ReductionOp>
-__device__ __forceinline__ T DeviceTest(
-    BlockReduceT    &block_reduce,
-    T               (&data)[ITEMS_PER_THREAD],
-    ReductionOp     &reduction_op)
-{
-    // Pass the whole per-thread array to the array Reduce overload
-    T aggregate = block_reduce.Reduce(data, reduction_op);
-    return aggregate;
-}
-
-/// Generic reduction over a partial tile, one item per thread
-template <typename BlockReduceT, typename T, typename ReductionOp>
-__device__ __forceinline__ T DeviceTest(
-    BlockReduceT    &block_reduce,
-    T               &data,
-    ReductionOp     &reduction_op,
-    int             valid_threads)
-{
-    // valid_threads is forwarded unchanged to Reduce
-    T aggregate = block_reduce.Reduce(data, reduction_op, valid_threads);
-    return aggregate;
-}
-
-/// Sum reduction over a full tile, one item per thread
-/// (reduction_op only selects this overload; it is not invoked)
-template <typename BlockReduceT, typename T>
-__device__ __forceinline__ T DeviceTest(
-    BlockReduceT    &block_reduce,
-    T               (&data)[1],
-    Sum             &reduction_op)
-{
-    T aggregate = block_reduce.Sum(data[0]);
-    return aggregate;
-}
-
-/// Sum reduction over a full tile, ITEMS_PER_THREAD items per thread
-/// (reduction_op only selects this overload; it is not invoked)
-template <typename BlockReduceT, typename T, int ITEMS_PER_THREAD>
-__device__ __forceinline__ T DeviceTest(
-    BlockReduceT    &block_reduce,
-    T               (&data)[ITEMS_PER_THREAD],
-    Sum             &reduction_op)
-{
-    T aggregate = block_reduce.Sum(data);
-    return aggregate;
-}
-
-/// Sum reduction over a partial tile, one item per thread
-/// (reduction_op only selects this overload; it is not invoked)
-template <typename BlockReduceT, typename T>
-__device__ __forceinline__ T DeviceTest(
-    BlockReduceT    &block_reduce,
-    T               &data,
-    Sum             &reduction_op,
-    int             valid_threads)
-{
-    T aggregate = block_reduce.Sum(data, valid_threads);
-    return aggregate;
-}
-
-
-/**
- * Test full-tile reduction kernel (where num_items is an even
- * multiple of BLOCK_THREADS)
- *
- * Reduces 'tiles' consecutive tiles of BLOCK_THREADS * ITEMS_PER_THREAD items
- * read from d_in, folding each tile's aggregate into a running aggregate with
- * reduction_op.  The thread with linear id 0 writes the final aggregate to
- * d_out[0] and the accumulated cycle count to *d_elapsed.  Nothing is written
- * when tiles <= 0.
- *
- * All offsets start at 0 and blockIdx is never consulted, so every block
- * processes the same data (the host code in this file launches one block).
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_DIM_X,
-    int                     BLOCK_DIM_Y,
-    int                     BLOCK_DIM_Z,
-    int                     ITEMS_PER_THREAD,
-    typename                T,
-    typename                ReductionOp>
-__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
-__global__ void FullTileReduceKernel(
-    T                       *d_in,
-    T                       *d_out,
-    ReductionOp             reduction_op,
-    int                     tiles,
-    clock_t                 *d_elapsed)
-{
-    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
-    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
-
-    // Cooperative thread block reduction utility type (returns aggregate in thread 0)
-    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;
-
-    // Allocate temp storage in shared memory
-    __shared__ typename BlockReduceT::TempStorage temp_storage;
-
-    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
-
-    // Per-thread tile data
-    T data[ITEMS_PER_THREAD];
-
-    // Load first tile of data
-    int block_offset = 0;
-
-    // block_offset and tiles are identical for every thread, so this branch
-    // (and the loop below, which contains a barrier) is taken uniformly
-    if (block_offset < TILE_SIZE * tiles)
-    {
-        LoadDirectBlocked(linear_tid, d_in + block_offset, data);
-        block_offset += TILE_SIZE;
-
-        // Start cycle timer
-        clock_t start = clock();
-
-        // Cooperative reduce first tile
-        BlockReduceT block_reduce(temp_storage) ;
-        T block_aggregate = DeviceTest(block_reduce, data, reduction_op);
-
-        // Stop cycle timer
- #if CUB_PTX_ARCH == 100
-        // Bug: recording stop clock causes mis-write of running prefix value
-        clock_t stop = 0;
-#else
-        clock_t stop = clock();
-#endif // CUB_PTX_ARCH == 100
-        // Absolute difference, whichever reading is larger
-        clock_t elapsed = (start > stop) ? start - stop : stop - start;
-
-        // Loop over input tiles
-        while (block_offset < TILE_SIZE * tiles)
-        {
-            // Barrier between thread block reductions (temp_storage is reused)
-            __syncthreads();
-    
-            // Load tile of data
-            LoadDirectBlocked(linear_tid, d_in + block_offset, data);
-            block_offset += TILE_SIZE;
-
-            // Start cycle timer (shadows the outer 'start')
-            clock_t start = clock();
-
-            // Cooperatively reduce the tile's aggregate
-            BlockReduceT block_reduce(temp_storage) ;
-            T tile_aggregate = DeviceTest(block_reduce, data, reduction_op);
-
-            // Stop cycle timer (shadows the outer 'stop')
-#if CUB_PTX_ARCH == 100
-            // Bug: recording stop clock causes mis-write of running prefix value
-            clock_t stop = 0;
-#else
-            clock_t stop = clock();
-#endif // CUB_PTX_ARCH == 100
-            elapsed += (start > stop) ? start - stop : stop - start;
-
-            // Reduce thread block aggregate
-            block_aggregate = reduction_op(block_aggregate, tile_aggregate);
-        }
-
-        // Store data (only the thread with linear id 0 publishes the results)
-        if (linear_tid == 0)
-        {
-            d_out[0] = block_aggregate;
-            *d_elapsed = elapsed;
-        }
-    }
-}
-
-
-
-/**
- * Test partial-tile reduction kernel (where num_items < BLOCK_THREADS)
- *
- * Each thread whose linear id is below num_items loads one item from d_in;
- * the block then reduces with num_items passed as the valid-thread count.
- * The thread with linear id 0 writes the aggregate to d_out[0] and the cycle
- * count of the reduction to *d_elapsed.
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_DIM_X,
-    int                     BLOCK_DIM_Y,
-    int                     BLOCK_DIM_Z,
-    typename                T,
-    typename                ReductionOp>
-__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
-__global__ void PartialTileReduceKernel(
-    T                       *d_in,
-    T                       *d_out,
-    int                     num_items,
-    ReductionOp             reduction_op,
-    clock_t                 *d_elapsed)
-{
-    // Cooperative thread block reduction utility type (returns aggregate only in thread-0)
-    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockReduceT;
-
-    // Allocate temp storage in shared memory
-    __shared__ typename BlockReduceT::TempStorage temp_storage;
-
-    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
-
-    // Per-thread tile data
-    T partial;
-
-    // Load partial tile data
-    // Threads at or beyond num_items leave 'partial' unassigned; presumably
-    // the reduction ignores their input because num_items is passed as the
-    // valid-thread count -- TODO confirm against BlockReduce
-    if (linear_tid < num_items)
-    {
-        partial = d_in[linear_tid];
-    }
-
-    // Start cycle timer
-    clock_t start = clock();
-
-    // Cooperatively reduce the tile's aggregate
-    BlockReduceT block_reduce(temp_storage) ;
-    T tile_aggregate = DeviceTest(block_reduce, partial, reduction_op, num_items);
-
-    // Stop cycle timer
-#if CUB_PTX_ARCH == 100
-    // Bug: recording stop clock causes mis-write of running prefix value
-    clock_t stop = 0;
-#else
-    clock_t stop = clock();
-#endif // CUB_PTX_ARCH == 100
-
-    // Absolute difference, whichever reading is larger
-    clock_t elapsed = (start > stop) ? start - stop : stop - start;
-
-    // Store data (only the thread with linear id 0 publishes the results)
-    if (linear_tid == 0)
-    {
-        d_out[0] = tile_aggregate;
-        *d_elapsed = elapsed;
-    }
-}
-
-
-//---------------------------------------------------------------------
-// Host utility subroutines
-//---------------------------------------------------------------------
-
-/**
- * Initialize problem (and solution): fills h_in with num_items generated
- * values and left-folds them with reduction_op into h_reference[0].
- * h_reference[0] is left untouched when num_items is not positive.
- */
-template <
-    typename    T,
-    typename    ReductionOp>
-void Initialize(
-    GenMode     gen_mode,
-    T           *h_in,
-    T           h_reference[1],
-    ReductionOp reduction_op,
-    int         num_items)
-{
-    for (int idx = 0; idx < num_items; ++idx)
-    {
-        InitValue(gen_mode, h_in[idx], idx);
-
-        if (idx != 0)
-        {
-            // Fold this item into the running reference
-            h_reference[0] = reduction_op(h_reference[0], h_in[idx]);
-        }
-        else
-        {
-            // The first item seeds the reference
-            h_reference[0] = h_in[0];
-        }
-    }
-
-    // Nothing more to do unless the input should be echoed
-    if (!g_verbose)
-        return;
-
-    printf("Input:\n");
-    DisplayResults(h_in, num_items);
-    printf("\n");
-}
-
-
-//---------------------------------------------------------------------
-// Full tile test generation
-//---------------------------------------------------------------------
-
-
-/**
- * Test full-tile reduction.  (Specialized for sufficient resources)
- *
- * Generates TILE_SIZE * tiles input items on the host, computes the reference
- * aggregate, launches FullTileReduceKernel with a single thread block of
- * BLOCK_DIM_X x BLOCK_DIM_Y x BLOCK_DIM_Z threads, and compares the device
- * aggregate against the reference.  A mismatch is reported as FAIL and passed
- * to AssertEquals.
- *
- * The trailing Int2Type<true> argument only selects this overload.
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_DIM_X,
-    int                     BLOCK_DIM_Y,
-    int                     BLOCK_DIM_Z,
-    int                     ITEMS_PER_THREAD,
-    typename                T,
-    typename                ReductionOp>
-void TestFullTile(
-    GenMode                 gen_mode,
-    int                     tiles,
-    ReductionOp             reduction_op,
-    Int2Type<true>          sufficient_resources)
-{
-    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
-    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
-
-    int num_items = TILE_SIZE * tiles;
-
-    // Allocate host arrays
-    T *h_in = new T[num_items];
-    T h_reference[1];
-
-    // Initialize problem
-    Initialize(gen_mode, h_in, h_reference, reduction_op, num_items);
-
-    // Initialize/clear device arrays
-    T       *d_in = NULL;
-    T       *d_out = NULL;
-    clock_t *d_elapsed = NULL;
-    // Sized from the pointee type: the kernel stores exactly one clock_t here
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * 1));
-
-    // Test multi-tile (unguarded)
-    printf("TestFullTile %s, %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), ITEMS_PER_THREAD(%d), tiles(%d), %s (%d bytes) elements:\n",
-        Equals<ReductionOp, Sum>::VALUE ? "Sum" : "Max",
-        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS",
-        gen_mode,
-        num_items,
-        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
-        ITEMS_PER_THREAD,
-        tiles,
-        typeid(T).name(),
-        (int) sizeof(T));
-    fflush(stdout);
-
-    // Launch a single (possibly multi-dimensional) thread block
-    dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
-    FullTileReduceKernel<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD><<<1, block_dims>>>(
-        d_in,
-        d_out,
-        reduction_op,
-        tiles,
-        d_elapsed);
-
-    // Surface launch errors, then wait for the kernel so execution errors surface too
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Copy out and display results (a nonzero compare result is reported as FAIL)
-    printf("\tReduction results: ");
-    int compare = CompareDeviceResults(h_reference, d_out, 1, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    printf("\tElapsed clocks: ");
-    DisplayDeviceResults(d_elapsed, 1);
-
-    // Cleanup (delete[] on a null pointer is a no-op, so no guard is needed)
-    delete[] h_in;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
-}
-
-
-/**
- * Test full-tile reduction.  (Specialized for insufficient resources:
- * nothing is instantiated or run)
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_DIM_X,
-    int                     BLOCK_DIM_Y,
-    int                     BLOCK_DIM_Z,
-    int                     ITEMS_PER_THREAD,
-    typename                T,
-    typename                ReductionOp>
-void TestFullTile(
-    GenMode                 gen_mode,
-    int                     tiles,
-    ReductionOp             reduction_op,
-    Int2Type<false>         sufficient_resources)
-{
-    // Intentionally empty
-}
-
-
-/**
- * Test full-tile reduction.  Decides at compile time whether the block
- * configuration fits the shared memory and thread-count limits assumed for
- * the target arch, then dispatches to the matching Int2Type overload.
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_DIM_X,
-    int                     BLOCK_DIM_Y,
-    int                     BLOCK_DIM_Z,
-    int                     ITEMS_PER_THREAD,
-    typename                T,
-    typename                ReductionOp>
-void TestFullTile(
-    GenMode                 gen_mode,
-    int                     tiles,
-    ReductionOp             reduction_op)
-{
-    // The reduction type whose TempStorage footprint is being checked
-    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, TEST_ARCH> BlockReduceT;
-
-    enum
-    {
-        // Per-arch resource limits
-#if defined(SM100) || defined(SM110) || defined(SM130)
-        smem_byte_limit     = 16 * 1024,
-        thread_limit        = 512,
-#else
-        smem_byte_limit     = 48 * 1024,
-        thread_limit        = 1024,
-#endif
-        block_threads       = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        // Both the shared memory footprint and the thread count must fit
-        can_run             = (sizeof(typename BlockReduceT::TempStorage) <= smem_byte_limit) && (block_threads <= thread_limit),
-    };
-
-    TestFullTile<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, T>(
-        gen_mode,
-        tiles,
-        reduction_op,
-        Int2Type<can_run>());
-}
-
-
-/**
- * Run battery of tests for different thread block dimensions:
- * (BLOCK_THREADS, 1, 1) and (BLOCK_THREADS, 2, 2)
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_THREADS,
-    int                     ITEMS_PER_THREAD,
-    typename                T,
-    typename                ReductionOp>
-void TestFullTile(
-    GenMode                 gen_mode,
-    int                     tiles,
-    ReductionOp             reduction_op)
-{
-    // One-dimensional block
-    TestFullTile<ALGORITHM, BLOCK_THREADS, 1, 1, ITEMS_PER_THREAD, T>(
-        gen_mode, tiles, reduction_op);
-
-    // Three-dimensional block
-    TestFullTile<ALGORITHM, BLOCK_THREADS, 2, 2, ITEMS_PER_THREAD, T>(
-        gen_mode, tiles, reduction_op);
-}
-
-/**
- * Run battery of tests for different numbers of items per thread (1 and 4)
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_THREADS,
-    typename                T,
-    typename                ReductionOp>
-void TestFullTile(
-    GenMode                 gen_mode,
-    int                     tiles,
-    ReductionOp             reduction_op)
-{
-    // ITEMS_PER_THREAD = 1
-    TestFullTile<ALGORITHM, BLOCK_THREADS, 1, T>(
-        gen_mode, tiles, reduction_op);
-
-    // ITEMS_PER_THREAD = 4
-    TestFullTile<ALGORITHM, BLOCK_THREADS, 4, T>(
-        gen_mode, tiles, reduction_op);
-}
-
-
-/**
- * Run battery of full-tile tests for different numbers of tiles (1 and 2)
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_THREADS,
-    typename                T,
-    typename                ReductionOp>
-void TestFullTile(
-    GenMode                 gen_mode,
-    ReductionOp             reduction_op)
-{
-    int tiles = 1;
-    while (tiles <= 2)
-    {
-        TestFullTile<ALGORITHM, BLOCK_THREADS, T>(gen_mode, tiles, reduction_op);
-        ++tiles;
-    }
-}
-
-
-//---------------------------------------------------------------------
-// Partial-tile test generation
-//---------------------------------------------------------------------
-
-/**
- * Test partial-tile reduction.  (Specialized for sufficient resources)
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_DIM_X,
-    int                     BLOCK_DIM_Y,
-    int                     BLOCK_DIM_Z,
-    typename                T,
-    typename                ReductionOp>
-void TestPartialTile(
-    GenMode                 gen_mode,
-    int                     num_items,
-    ReductionOp             reduction_op,
-    Int2Type<true>          sufficient_resources)
-{
-    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
-    const int TILE_SIZE         = BLOCK_THREADS;
-
-    // Allocate host arrays
-    T *h_in = new T[num_items];
-    T h_reference[1];
-
-    // Initialize problem
-    Initialize(gen_mode, h_in, h_reference, reduction_op, num_items);
-
-    // Initialize/clear device arrays
-    T       *d_in = NULL;
-    T       *d_out = NULL;
-    clock_t *d_elapsed = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long)));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * 1));
-
-    printf("TestPartialTile %s, gen-mode %d, num_items(%d), BLOCK_THREADS(%d) (%d,%d,%d), %s (%d bytes) elements:\n",
-        (ALGORITHM == BLOCK_REDUCE_RAKING) ? "BLOCK_REDUCE_RAKING" : (ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY) ? "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY" : "BLOCK_REDUCE_WARP_REDUCTIONS",
-        gen_mode,
-        num_items,
-        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
-        typeid(T).name(),
-        (int) sizeof(T));
-    fflush(stdout);
-
-    dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
-    PartialTileReduceKernel<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z><<<1, block_dims>>>(
-        d_in,
-        d_out,
-        num_items,
-        reduction_op,
-        d_elapsed);
-
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Copy out and display results
-    printf("\tReduction results: ");
-    int compare = CompareDeviceResults(h_reference, d_out, 1, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    printf("\tElapsed clocks: ");
-    DisplayDeviceResults(d_elapsed, 1);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
-}
-
-
-
-/**
- * Test partial-tile reduction (specialized for insufficient resources)
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_DIM_X,
-    int                     BLOCK_DIM_Y,
-    int                     BLOCK_DIM_Z,
-    typename                T,
-    typename                ReductionOp>
-void TestPartialTile(
-    GenMode                 gen_mode,
-    int                     num_items,
-    ReductionOp             reduction_op,
-    Int2Type<false>         sufficient_resources)
-{}
-
-
-/**
- *  Run battery of partial-tile tests for different numbers of effective threads and thread dimensions
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_DIM_X,
-    int                     BLOCK_DIM_Y,
-    int                     BLOCK_DIM_Z,
-    typename                T,
-    typename                ReductionOp>
-void TestPartialTile(
-    GenMode                 gen_mode,
-    int                     num_items,
-    ReductionOp             reduction_op)
-{
-    // Check size of smem storage for the target arch to make sure it will fit
-    typedef BlockReduce<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z, TEST_ARCH> BlockReduceT;
-
-    enum 
-    {
-#if defined(SM100) || defined(SM110) || defined(SM130)
-        sufficient_smem       = sizeof(typename BlockReduceT::TempStorage)  <= 16 * 1024,
-        sufficient_threads    = (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)   <= 512,
-#else
-        sufficient_smem       = sizeof(typename BlockReduceT::TempStorage)  <= 48 * 1024,
-        sufficient_threads    = (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)   <= 1024,
-#endif
-    };
-
-    TestPartialTile<ALGORITHM, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, T>(gen_mode, num_items, reduction_op, Int2Type<sufficient_smem && sufficient_threads>());
-}
-
-
-
-/**
- *  Run battery of partial-tile tests for different numbers of effective threads and thread dimensions
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_THREADS,
-    typename                T,
-    typename                ReductionOp>
-void TestPartialTile(
-    GenMode                 gen_mode,
-    ReductionOp             reduction_op)
-{
-    for (
-        int num_items = 1;
-        num_items < BLOCK_THREADS;
-        num_items += CUB_MAX(1, BLOCK_THREADS / 5))
-    {
-        TestPartialTile<ALGORITHM, BLOCK_THREADS, 1, 1, T>(gen_mode, num_items, reduction_op);
-        TestPartialTile<ALGORITHM, BLOCK_THREADS, 2, 2, T>(gen_mode, num_items, reduction_op);
-    }
-}
-
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Run battery of full-tile tests for different gen modes
- */
-template <
-    BlockReduceAlgorithm    ALGORITHM,
-    int                     BLOCK_THREADS,
-    typename                T,
-    typename                ReductionOp>
-void Test(
-    ReductionOp             reduction_op)
-{
-    TestFullTile<ALGORITHM, BLOCK_THREADS, T>(UNIFORM, reduction_op);
-    TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(UNIFORM, reduction_op);
-
-    TestFullTile<ALGORITHM, BLOCK_THREADS, T>(INTEGER_SEED, reduction_op);
-    TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(INTEGER_SEED, reduction_op);
-
-    if (Traits<T>::CATEGORY != FLOATING_POINT)
-    {
-        // Don't test randomly-generated floats b/c of stability
-        TestFullTile<ALGORITHM, BLOCK_THREADS, T>(RANDOM, reduction_op);
-        TestPartialTile<ALGORITHM, BLOCK_THREADS, T>(RANDOM, reduction_op);
-    }
-}
-
-
-/**
- * Run battery of tests for different block-reduction algorithmic variants
- */
-template <
-    int             BLOCK_THREADS,
-    typename        T,
-    typename        ReductionOp>
-void Test(
-    ReductionOp     reduction_op)
-{
-#ifdef TEST_RAKING
-    Test<BLOCK_REDUCE_RAKING, BLOCK_THREADS, T>(reduction_op);
-    Test<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, BLOCK_THREADS, T>(reduction_op);
-#endif
-#ifdef TEST_WARP_REDUCTIONS
-    Test<BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_THREADS, T>(reduction_op);
-#endif
-}
-
-
-/**
- * Run battery of tests for different block sizes
- */
-template <
-    typename        T,
-    typename        ReductionOp>
-void Test(
-    ReductionOp     reduction_op)
-{
-    Test<7,   T>(reduction_op);
-    Test<32,  T>(reduction_op);
-    Test<63,  T>(reduction_op);
-    Test<97,  T>(reduction_op);
-    Test<128, T>(reduction_op);
-    Test<238, T>(reduction_op);
-}
-
-
-/**
- * Run battery of tests for different block sizes
- */
-template <typename T>
-void Test()
-{
-    Test<T>(Sum());
-    Test<T>(Max());
-}
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("repeat", g_repeat);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Get ptx version
-    int ptx_version;
-    CubDebugExit(PtxVersion(ptx_version));
-
-#ifdef QUICK_TEST
-
-    // Compile/run quick tests
-
-
-    printf("\n full tile ------------------------\n\n");
-
-    TestFullTile<BLOCK_REDUCE_RAKING,                   128, 1, 1, 4, int>(RANDOM, 1, Sum());
-    TestFullTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,  128, 1, 1, 4, int>(RANDOM, 1, Sum());
-    TestFullTile<BLOCK_REDUCE_WARP_REDUCTIONS,          128, 1, 1, 4, int>(RANDOM, 1, Sum());
-
-    TestFullTile<BLOCK_REDUCE_RAKING,                   128, 1, 1, 1, int>(RANDOM, 1, Sum());
-    TestFullTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,  128, 1, 1, 1, int>(RANDOM, 1, Sum());
-    TestFullTile<BLOCK_REDUCE_WARP_REDUCTIONS,          128, 1, 1, 1, int>(RANDOM, 1, Sum());
-
-    printf("\n partial tile ------------------------\n\n");
-
-    TestPartialTile<BLOCK_REDUCE_RAKING,                   128, 1, 1, int>(RANDOM, 7, Sum());
-    TestPartialTile<BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,  128, 1, 1, int>(RANDOM, 7, Sum());
-    TestPartialTile<BLOCK_REDUCE_WARP_REDUCTIONS,          128, 1, 1, int>(RANDOM, 7, Sum());
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        // primitives
-        Test<char>();
-        Test<short>();
-        Test<int>();
-        Test<long long>();
-        if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
-            Test<double>();
-
-        Test<float>();
-
-        // vector types
-        Test<char2>();
-        Test<short2>();
-        Test<int2>();
-        Test<longlong2>();
-
-        Test<char4>();
-        Test<short4>();
-        Test<int4>();
-        Test<longlong4>();
-
-        // Complex types
-        Test<TestFoo>();
-        Test<TestBar>();
-    }
-
-#endif
-
-    return 0;
-}
-
-
diff --git a/external/cub/test/test_block_scan.cu b/external/cub/test/test_block_scan.cu
deleted file mode 100644
index 033c89ee094..00000000000
--- a/external/cub/test/test_block_scan.cu
+++ /dev/null
@@ -1,929 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of BlockScan utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <iostream>
-#include <limits>
-#include <typeinfo>
-
-#include <cub/block/block_scan.cuh>
-#include <cub/block/block_load.cuh>
-#include <cub/block/block_store.cuh>
-#include <cub/util_ptx.cuh>
-#include <cub/util_allocator.cuh>
-
-#include "test_util.h"
-
-
-using namespace cub;
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose       = false;
-int                     g_repeat        = 0;
-CachingDeviceAllocator  g_allocator(true);
-
-
-/**
- * Primitive variant to test
- */
-enum TestMode
-{
-    BASIC,
-    AGGREGATE,
-    PREFIX,
-};
-
-
-/**
- * Scan mode to test
- */
-enum ScanMode
-{
-    EXCLUSIVE,
-    INCLUSIVE
-};
-
-
-/**
- * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants)
- */
-template<typename OpT>
-struct WrapperFunctor
-{
-    OpT op;
-
-    WrapperFunctor(OpT op) : op(op) {}
-
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return op(a, b);
-    }
-};
-
-
-/**
- * Stateful prefix functor
- */
-template <
-    typename T,
-    typename ScanOpT>
-struct BlockPrefixCallbackOp
-{
-    int     linear_tid;
-    T       prefix;
-    ScanOpT  scan_op;
-
-    __device__ __forceinline__
-    BlockPrefixCallbackOp(int linear_tid, T prefix, ScanOpT scan_op) :
-        linear_tid(linear_tid),
-        prefix(prefix),
-        scan_op(scan_op)
-    {}
-
-    __device__ __forceinline__
-    T operator()(T block_aggregate)
-    {
-        // For testing purposes
-        T retval = (linear_tid == 0) ? prefix  : T();
-        prefix = scan_op(prefix, block_aggregate);
-        return retval;
-    }
-};
-
-
-//---------------------------------------------------------------------
-// Exclusive scan
-//---------------------------------------------------------------------
-
-/// Exclusive scan (BASIC, 1)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.ExclusiveScan(data[0], data[0], initial_value, scan_op);
-}
-
-/// Exclusive scan (BASIC, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.ExclusiveScan(data, data, initial_value, scan_op);
-}
-
-/// Exclusive scan (AGGREGATE, 1)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.ExclusiveScan(data[0], data[0], initial_value, scan_op, block_aggregate);
-}
-
-/// Exclusive scan (AGGREGATE, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.ExclusiveScan(data, data, initial_value, scan_op, block_aggregate);
-}
-
-/// Exclusive scan (PREFIX, 1)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.ExclusiveScan(data[0], data[0], scan_op, prefix_op);
-}
-
-/// Exclusive scan (PREFIX, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.ExclusiveScan(data, data, scan_op, prefix_op);
-}
-
-
-//---------------------------------------------------------------------
-// Exclusive sum
-//---------------------------------------------------------------------
-
-/// Exclusive sum (BASIC, 1)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.ExclusiveSum(data[0], data[0]);
-}
-
-/// Exclusive sum (BASIC, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.ExclusiveSum(data, data);
-}
-
-/// Exclusive sum (AGGREGATE, 1)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.ExclusiveSum(data[0], data[0], block_aggregate);
-}
-
-/// Exclusive sum (AGGREGATE, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.ExclusiveSum(data, data, block_aggregate);
-}
-
-/// Exclusive sum (PREFIX, 1)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.ExclusiveSum(data[0], data[0], prefix_op);
-}
-
-/// Exclusive sum (PREFIX, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<EXCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.ExclusiveSum(data, data, prefix_op);
-}
-
-
-//---------------------------------------------------------------------
-// Inclusive scan
-//---------------------------------------------------------------------
-
-/// Inclusive scan (BASIC, 1)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.InclusiveScan(data[0], data[0], scan_op);
-}
-
-/// Inclusive scan (BASIC, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.InclusiveScan(data, data, scan_op);
-}
-
-/// Inclusive scan (AGGREGATE, 1)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.InclusiveScan(data[0], data[0], scan_op, block_aggregate);
-}
-
-/// Inclusive scan (AGGREGATE, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.InclusiveScan(data, data, scan_op, block_aggregate);
-}
-
-/// Inclusive scan (PREFIX, 1)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.InclusiveScan(data[0], data[0], scan_op, prefix_op);
-}
-
-/// Inclusive scan (PREFIX, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename ScanOpT, typename PrefixCallbackOp, int ITEMS_PER_THREAD, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, IsPrimitiveT is_primitive)
-{
-    block_scan.InclusiveScan(data, data, scan_op, prefix_op);
-}
-
-
-//---------------------------------------------------------------------
-// Inclusive sum
-//---------------------------------------------------------------------
-
-/// Inclusive sum (BASIC, 1)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.InclusiveSum(data[0], data[0]);
-}
-
-/// Inclusive sum (BASIC, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<BASIC> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.InclusiveSum(data, data);
-}
-
-/// Inclusive sum (AGGREGATE, 1)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.InclusiveSum(data[0], data[0], block_aggregate);
-}
-
-/// Inclusive sum (AGGREGATE, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<AGGREGATE> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.InclusiveSum(data, data, block_aggregate);
-}
-
-/// Inclusive sum (PREFIX, 1)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.InclusiveSum(data[0], data[0], prefix_op);
-}
-
-/// Inclusive sum (PREFIX, ITEMS_PER_THREAD)
-template <typename BlockScanT, typename T, typename PrefixCallbackOp, int ITEMS_PER_THREAD>
-__device__ __forceinline__ void DeviceTest(
-    BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op,
-    Int2Type<INCLUSIVE> scan_mode, Int2Type<PREFIX> test_mode, Int2Type<true> is_primitive)
-{
-    block_scan.InclusiveSum(data, data, prefix_op);
-}
-
-
-
-//---------------------------------------------------------------------
-// Test kernels
-//---------------------------------------------------------------------
-
-/**
- * BlockScan test kernel.
- */
-template <
-    int                 BLOCK_DIM_X,
-    int                 BLOCK_DIM_Y,
-    int                 BLOCK_DIM_Z,
-    int                 ITEMS_PER_THREAD,
-    ScanMode            SCAN_MODE,
-    TestMode            TEST_MODE,
-    BlockScanAlgorithm  ALGORITHM,
-    typename            T,
-    typename            ScanOpT>
-__launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)
-__global__ void BlockScanKernel(
-    T                   *d_in,
-    T                   *d_out,
-    T                   *d_aggregate,
-    ScanOpT              scan_op,
-    T                   initial_value,
-    clock_t             *d_elapsed)
-{
-    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
-    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
-
-    // Parameterize BlockScan type for our thread block
-    typedef BlockScan<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockScanT;
-
-    // Allocate temp storage in shared memory
-    __shared__ typename BlockScanT::TempStorage temp_storage;
-
-    int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
-
-    // Per-thread tile data
-    T data[ITEMS_PER_THREAD];
-    LoadDirectBlocked(linear_tid, d_in, data);
-
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t start = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    // Test scan
-    T                                   block_aggregate;
-    BlockScanT                          block_scan(temp_storage);
-    BlockPrefixCallbackOp<T, ScanOpT>   prefix_op(linear_tid, initial_value, scan_op);
-
-    DeviceTest(block_scan, data, initial_value, scan_op, block_aggregate, prefix_op,
-        Int2Type<SCAN_MODE>(), Int2Type<TEST_MODE>(), Int2Type<Traits<T>::PRIMITIVE>());
-
-    // Stop cycle timer
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t stop = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    // Store output
-    StoreDirectBlocked(linear_tid, d_out, data);
-
-    // Store block_aggregate
-    if (TEST_MODE != BASIC)
-        d_aggregate[linear_tid] = block_aggregate;
-
-    // Store prefix
-    if (TEST_MODE == PREFIX)
-    {
-        if (linear_tid == 0)
-            d_out[TILE_SIZE] = prefix_op.prefix;
-    }
-
-    // Store time
-    if (linear_tid == 0)
-        *d_elapsed = (start > stop) ? start - stop : stop - start;
-}
-
-
-
-//---------------------------------------------------------------------
-// Host utility subroutines
-//---------------------------------------------------------------------
-
-/**
- * Initialize exclusive-scan problem (and solution)
- */
-template <typename T, typename ScanOpT>
-T Initialize(
-    GenMode     gen_mode,
-    T           *h_in,
-    T           *h_reference,
-    int         num_items,
-    ScanOpT     scan_op,
-    T           initial_value,
-    Int2Type<EXCLUSIVE>)
-{
-    InitValue(gen_mode, h_in[0], 0);
-
-    T block_aggregate   = h_in[0];
-    h_reference[0]      = initial_value;
-    T inclusive         = scan_op(initial_value, h_in[0]);
-
-    for (int i = 1; i < num_items; ++i)
-    {
-        InitValue(gen_mode, h_in[i], i);
-        h_reference[i] = inclusive;
-        inclusive = scan_op(inclusive, h_in[i]);
-        block_aggregate = scan_op(block_aggregate, h_in[i]);
-    }
-
-    return block_aggregate;
-}
-
-
-/**
- * Initialize inclusive-scan problem (and solution)
- */
-template <typename T, typename ScanOpT>
-T Initialize(
-    GenMode     gen_mode,
-    T           *h_in,
-    T           *h_reference,
-    int         num_items,
-    ScanOpT      scan_op,
-    T           initial_value,
-    Int2Type<INCLUSIVE>)
-{
-    InitValue(gen_mode, h_in[0], 0);
-
-    T block_aggregate   = h_in[0];
-    T inclusive         = scan_op(initial_value, h_in[0]);
-    h_reference[0]      = inclusive;
-
-    for (int i = 1; i < num_items; ++i)
-    {
-        InitValue(gen_mode, h_in[i], i);
-        inclusive = scan_op(inclusive, h_in[i]);
-        block_aggregate = scan_op(block_aggregate, h_in[i]);
-        h_reference[i] = inclusive;
-    }
-
-    return block_aggregate;
-}
-
-
-/**
- * Test thread block scan.  (Specialized for sufficient resources)
- */
-template <
-    int                 BLOCK_DIM_X,
-    int                 BLOCK_DIM_Y,
-    int                 BLOCK_DIM_Z,
-    int                 ITEMS_PER_THREAD,
-    ScanMode            SCAN_MODE,
-    TestMode            TEST_MODE,
-    BlockScanAlgorithm  ALGORITHM,
-    typename            ScanOpT,
-    typename            T>
-void Test(
-    GenMode             gen_mode,
-    ScanOpT             scan_op,
-    T                   initial_value,
-    Int2Type<true>      sufficient_resources)
-{
-    const int BLOCK_THREADS     = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z;
-    const int TILE_SIZE         = BLOCK_THREADS * ITEMS_PER_THREAD;
-
-    // Allocate host arrays
-    T *h_in = new T[TILE_SIZE];
-    T *h_reference = new T[TILE_SIZE];
-    T *h_aggregate = new T[BLOCK_THREADS];
-
-    // Initialize problem
-    T block_aggregate = Initialize(
-        gen_mode,
-        h_in,
-        h_reference,
-        TILE_SIZE,
-        scan_op,
-        initial_value,
-        Int2Type<SCAN_MODE>());
-
-    // Test reference block_aggregate is returned in all threads
-    for (int i = 0; i < BLOCK_THREADS; ++i)
-    {
-        h_aggregate[i] = block_aggregate;
-    }
-
-    // Run kernel
-    printf("Test-mode %d, gen-mode %d, policy %d, %s %s BlockScan, %d (%d,%d,%d) thread block threads, %d items per thread, %d tile size, %s (%d bytes) elements:\n",
-        TEST_MODE, gen_mode, ALGORITHM,
-        (SCAN_MODE == INCLUSIVE) ? "Inclusive" : "Exclusive", typeid(ScanOpT).name(),
-        BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z,
-        ITEMS_PER_THREAD,  TILE_SIZE,
-        typeid(T).name(), (int) sizeof(T));
-    fflush(stdout);
-
-    // Initialize/clear device arrays
-    T       *d_in = NULL;
-    T       *d_out = NULL;
-    T       *d_aggregate = NULL;
-    clock_t *d_elapsed = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long)));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * (TILE_SIZE + 2)));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_aggregate, sizeof(T) * BLOCK_THREADS));
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * TILE_SIZE, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * (TILE_SIZE + 1)));
-    CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * BLOCK_THREADS));
-
-    // Display input problem data
-    if (g_verbose)
-    {
-        printf("Input data: ");
-        for (int i = 0; i < TILE_SIZE; i++)
-        {
-            std::cout << CoutCast(h_in[i]) << ", ";
-        }
-        printf("\n\n");
-    }
-
-    // Run block_aggregate/prefix kernel
-    dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z);
-    BlockScanKernel<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM><<<1, block_dims>>>(
-        d_in,
-        d_out,
-        d_aggregate,
-        scan_op,
-        initial_value,
-        d_elapsed);
-
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Copy out and display results
-    printf("\tScan results: ");
-    int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    if (TEST_MODE == AGGREGATE)
-    {
-        // Copy out and display block_aggregate
-        printf("\tScan block aggregate: ");
-        compare = CompareDeviceResults(h_aggregate, d_aggregate, BLOCK_THREADS, g_verbose, g_verbose);
-        printf("%s\n", compare ? "FAIL" : "PASS");
-        AssertEquals(0, compare);
-    }
-
-    if (TEST_MODE == PREFIX)
-    {
-        // Copy out and display updated prefix
-        printf("\tScan running total: ");
-        T running_total = scan_op(initial_value, block_aggregate);
-        compare = CompareDeviceResults(&running_total, d_out + TILE_SIZE, 1, g_verbose, g_verbose);
-        printf("%s\n", compare ? "FAIL" : "PASS");
-        AssertEquals(0, compare);
-    }
-
-    printf("\tElapsed clocks: ");
-    DisplayDeviceResults(d_elapsed, 1);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (h_aggregate) delete[] h_aggregate;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_aggregate) CubDebugExit(g_allocator.DeviceFree(d_aggregate));
-    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
-}
-
-
-/**
- * Test thread block scan.  (Specialized for insufficient resources)
- */
-template <
-    int                 BLOCK_DIM_X,
-    int                 BLOCK_DIM_Y,
-    int                 BLOCK_DIM_Z,
-    int                 ITEMS_PER_THREAD,
-    ScanMode            SCAN_MODE,
-    TestMode            TEST_MODE,
-    BlockScanAlgorithm  ALGORITHM,
-    typename            ScanOpT,
-    typename            T>
-void Test(
-    GenMode             gen_mode,
-    ScanOpT             scan_op,
-    T                   initial_value,
-    Int2Type<false>     sufficient_resources)
-{}
-
-
-/**
- * Test thread block scan.
- */
-template <
-    int                 BLOCK_DIM_X,
-    int                 BLOCK_DIM_Y,
-    int                 BLOCK_DIM_Z,
-    int                 ITEMS_PER_THREAD,
-    ScanMode            SCAN_MODE,
-    TestMode            TEST_MODE,
-    BlockScanAlgorithm  ALGORITHM,
-    typename            ScanOpT,
-    typename            T>
-void Test(
-    GenMode             gen_mode,
-    ScanOpT             scan_op,
-    T                   initial_value)
-{
-    // Check size of smem storage for the target arch to make sure it will fit
-    typedef BlockScan<T, BLOCK_DIM_X, ALGORITHM, BLOCK_DIM_Y, BLOCK_DIM_Z> BlockScanT;
-
-    enum
-    {
-#if defined(SM100) || defined(SM110) || defined(SM130)
-        sufficient_smem         = (sizeof(typename BlockScanT::TempStorage)     <= 16 * 1024),
-        sufficient_threads      = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)    <= 512),
-#else
-        sufficient_smem         = (sizeof(typename BlockScanT::TempStorage)     <= 16 * 1024),
-        sufficient_threads      = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z)    <= 1024),
-#endif
-
-#if defined(_WIN32) || defined(_WIN64)
-        // Accommodate ptxas crash bug (access violation) on Windows
-        special_skip            = ((TEST_ARCH <= 130) && (Equals<T, TestBar>::VALUE) && (BLOCK_DIM_Z > 1)),
-#else
-        special_skip            = false,
-#endif
-        sufficient_resources    = (sufficient_smem && sufficient_threads && !special_skip),
-    };
-
-    Test<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM>(
-        gen_mode, scan_op, initial_value, Int2Type<sufficient_resources>());
-}
-
-
-
-/**
- * Run test for different thread block dimensions
- */
-template <
-    int                 BLOCK_THREADS,
-    int                 ITEMS_PER_THREAD,
-    ScanMode            SCAN_MODE,
-    TestMode            TEST_MODE,
-    BlockScanAlgorithm  ALGORITHM,
-    typename            ScanOpT,
-    typename            T>
-void Test(
-    GenMode     gen_mode,
-    ScanOpT     scan_op,
-    T           initial_value)
-{
-    Test<BLOCK_THREADS, 1, 1, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM>(gen_mode, scan_op, initial_value);
-    Test<BLOCK_THREADS, 2, 2, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, ALGORITHM>(gen_mode, scan_op, initial_value);
-}
-
-
-/**
- * Run test for different policy types
- */
-template <
-    int         BLOCK_THREADS,
-    int         ITEMS_PER_THREAD,
-    ScanMode    SCAN_MODE,
-    TestMode    TEST_MODE,
-    typename    ScanOpT,
-    typename    T>
-void Test(
-    GenMode     gen_mode,
-    ScanOpT     scan_op,
-    T           initial_value)
-{
-#ifdef TEST_RAKING
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, BLOCK_SCAN_RAKING>(gen_mode, scan_op, initial_value);
-#endif
-#ifdef TEST_RAKING_MEMOIZE
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, BLOCK_SCAN_RAKING_MEMOIZE>(gen_mode, scan_op, initial_value);
-#endif
-#ifdef TEST_WARP_SCANS
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, SCAN_MODE, TEST_MODE, BLOCK_SCAN_WARP_SCANS>(gen_mode, scan_op, initial_value);
-#endif
-}
-
-
-/**
- * Run tests for different primitive variants
- */
-template <
-    int         BLOCK_THREADS,
-    int         ITEMS_PER_THREAD,
-    typename    ScanOpT,
-    typename    T>
-void Test(
-    GenMode     gen_mode,
-    ScanOpT     scan_op,
-    T           identity,
-    T           initial_value)
-{
-    // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values)
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, BASIC>(gen_mode, scan_op, identity);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, AGGREGATE>(gen_mode, scan_op, identity);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, PREFIX>(gen_mode, scan_op, identity);
-
-    // Exclusive (non-specialized, so we can use initial-value)
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, BASIC>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, AGGREGATE>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, EXCLUSIVE, PREFIX>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
-
-    // Inclusive
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, INCLUSIVE, BASIC>(gen_mode, scan_op, identity);      // This scan doesn't take an initial value
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, INCLUSIVE, AGGREGATE>(gen_mode, scan_op, identity);  // This scan doesn't take an initial value
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD, INCLUSIVE, PREFIX>(gen_mode, scan_op, initial_value);
-}
-
-
-/**
- * Run tests for different problem-generation options
- */
-template <
-    int         BLOCK_THREADS,
-    int         ITEMS_PER_THREAD,
-    typename    ScanOpT,
-    typename    T>
-void Test(
-    ScanOpT     scan_op,
-    T           identity,
-    T           initial_value)
-{
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(UNIFORM, scan_op, identity, initial_value);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(INTEGER_SEED, scan_op, identity, initial_value);
-
-    // Don't test randomly-generated floats b/c of stability
-    if (Traits<T>::CATEGORY != FLOATING_POINT)
-        Test<BLOCK_THREADS, ITEMS_PER_THREAD>(RANDOM, scan_op, identity, initial_value);
-}
-
-
-/**
- * Run tests for different data types and scan ops
- */
-template <
-    int BLOCK_THREADS,
-    int ITEMS_PER_THREAD>
-void Test()
-{
-    // Get ptx version
-    int ptx_version;
-    CubDebugExit(PtxVersion(ptx_version));
-
-    // primitive
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned char) 0, (unsigned char) 99);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned short) 0, (unsigned short) 99);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned int) 0, (unsigned int) 99);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (unsigned long long) 0, (unsigned long long) 99);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), (float) 0, (float) 99);
-
-    // primitive (alternative scan op)
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<char>::min(), (char) 99);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<short>::min(), (short) 99);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<int>::min(), (int) 99);
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<long long>::min(), (long long) 99);
-
-    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
-        Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Max(), std::numeric_limits<double>::max() * -1, (double) 99);
-
-    // vec-1
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_uchar1(0), make_uchar1(17));
-
-    // vec-2
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_uchar2(0, 0), make_uchar2(17, 21));
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_ushort2(0, 0), make_ushort2(17, 21));
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_uint2(0, 0), make_uint2(17, 21));
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_ulonglong2(0, 0), make_ulonglong2(17, 21));
-
-    // vec-4
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_char4(0, 0, 0, 0), make_char4(17, 21, 32, 85));
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_short4(0, 0, 0, 0), make_short4(17, 21, 32, 85));
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_int4(0, 0, 0, 0), make_int4(17, 21, 32, 85));
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), make_longlong4(0, 0, 0, 0), make_longlong4(17, 21, 32, 85));
-
-    // complex
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), TestFoo::MakeTestFoo(0, 0, 0, 0), TestFoo::MakeTestFoo(17, 21, 32, 85));
-    Test<BLOCK_THREADS, ITEMS_PER_THREAD>(Sum(), TestBar(0, 0), TestBar(17, 21));
-
-}
-
-
-/**
- * Run tests for different items per thread
- */
-template <int BLOCK_THREADS>
-void Test()
-{
-    Test<BLOCK_THREADS, 1>();
-    Test<BLOCK_THREADS, 2>();
-    Test<BLOCK_THREADS, 9>();
-}
-
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("repeat", g_repeat);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-#ifdef QUICK_TEST
-
-    Test<128, 1, 1, 1, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), int(0));
-
-    // Compile/run quick tests
-    Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), int(0));
-    Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_RAKING>(UNIFORM, Sum(), int(0));
-    Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_RAKING_MEMOIZE>(UNIFORM, Sum(), int(0));
-
-    Test<128, 1, 1, 2, INCLUSIVE, PREFIX, BLOCK_SCAN_RAKING>(INTEGER_SEED, Sum(), TestFoo::MakeTestFoo(17, 21, 32, 85));
-    Test<128, 1, 1, 1, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), make_longlong4(17, 21, 32, 85));
-
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        // Run tests for different thread block sizes
-        Test<17>();
-        Test<32>();
-        Test<62>();
-        Test<65>();
-//            Test<96>();             // TODO: file bug for UNREACHABLE error for Test<96, 9, BASIC, BLOCK_SCAN_RAKING>(UNIFORM, Sum(), NullType(), make_ulonglong2(17, 21));
-        Test<128>();
-    }
-
-#endif
-
-    return 0;
-}
-
-
-
-
diff --git a/external/cub/test/test_device_histogram.cu b/external/cub/test/test_device_histogram.cu
deleted file mode 100644
index b77b7391041..00000000000
--- a/external/cub/test/test_device_histogram.cu
+++ /dev/null
@@ -1,1669 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of DeviceHistogram utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <limits>
-#include <algorithm>
-#include <typeinfo>
-
-#if defined(QUICK_TEST) || defined(QUICKER_TEST)
-    #include <npp.h>
-#endif
-
-#include <cub/util_allocator.cuh>
-#include <cub/iterator/constant_input_iterator.cuh>
-#include <cub/device/device_histogram.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-
-// Dispatch types
-enum Backend
-{
-    CUB,        // CUB method
-    NPP,        // NPP method
-    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
-};
-
-
-bool                    g_verbose_input     = false;
-bool                    g_verbose           = false;
-int                     g_timing_iterations = 0;
-int                     g_repeat            = 0;
-CachingDeviceAllocator  g_allocator(true);
-
-
-
-
-//---------------------------------------------------------------------
-// Dispatch to NPP histogram
-//---------------------------------------------------------------------
-
-#if defined(QUICK_TEST) || defined(QUICKER_TEST)
-
-/**
- * Dispatch to single-channel 8b NPP histo-even
- */
-template <typename CounterT, typename LevelT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t DispatchEven(
-    Int2Type<1>             num_channels,
-    Int2Type<1>             num_active_channels,
-    Int2Type<NPP>           dispatch_to,
-    int                     timing_timing_iterations,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-    CounterT            *d_histogram[1],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-    int                 num_levels[1],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    LevelT              lower_level[1],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-    LevelT              upper_level[1],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-    OffsetT             num_row_pixels,           ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT             num_rows,                 ///< [in] The number of rows in the region of interest
-    OffsetT             row_stride_bytes,         ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    typedef unsigned char SampleT;
-
-    cudaError_t error = cudaSuccess;
-    NppiSize oSizeROI = {
-        num_row_pixels,
-        num_rows
-    };
-
-    if (d_temp_storage_bytes == NULL)
-    {
-        int nDeviceBufferSize;
-        nppiHistogramEvenGetBufferSize_8u_C1R(oSizeROI, num_levels[0] ,&nDeviceBufferSize);
-        temp_storage_bytes = nDeviceBufferSize;
-    }
-    else
-    {
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            // compute the histogram
-            nppiHistogramEven_8u_C1R(
-                d_samples,
-                row_stride_bytes,
-                oSizeROI,
-                d_histogram[0],
-                num_levels[0],
-                lower_level[0],
-                upper_level[0],
-                (Npp8u*) d_temp_storage);
-        }
-    }
-
-    return error;
-}
-
-
-/**
- * Dispatch to 3/4 8b NPP histo-even
- */
-template <typename CounterT, typename LevelT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t DispatchEven(
-    Int2Type<4>          num_channels,
-    Int2Type<3>   num_active_channels,
-    Int2Type<NPP>           dispatch_to,
-    int                     timing_timing_iterations,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    unsigned char       *d_samples,               ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-    CounterT            *d_histogram[3],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-    int                 num_levels[3],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    LevelT              lower_level[3],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-    LevelT              upper_level[3],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-    OffsetT             num_row_pixels,           ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT             num_rows,                 ///< [in] The number of rows in the region of interest
-    OffsetT             row_stride_bytes,         ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    typedef unsigned char SampleT;
-
-    cudaError_t error = cudaSuccess;
-    NppiSize oSizeROI = {
-        num_row_pixels,
-        num_rows
-    };
-
-    if (d_temp_storage_bytes == NULL)
-    {
-        int nDeviceBufferSize;
-        nppiHistogramEvenGetBufferSize_8u_AC4R(oSizeROI, num_levels ,&nDeviceBufferSize);
-        temp_storage_bytes = nDeviceBufferSize;
-    }
-    else
-    {
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            // compute the histogram
-            nppiHistogramEven_8u_AC4R(
-                d_samples,
-                row_stride_bytes,
-                oSizeROI,
-                d_histogram,
-                num_levels,
-                lower_level,
-                upper_level,
-                (Npp8u*) d_temp_storage);
-        }
-    }
-
-    return error;
-}
-
-
-#endif // #if defined(QUICK_TEST) || defined(QUICKER_TEST)
-
-
-//---------------------------------------------------------------------
-// Dispatch to different DeviceHistogram entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to CUB single histogram-even entrypoint
- */
-template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t DispatchEven(
-    Int2Type<1>             num_channels,
-    Int2Type<1>             num_active_channels,
-    Int2Type<CUB>           dispatch_to,
-    int                     timing_timing_iterations,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-    CounterT            *d_histogram[1],                            ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-    int                 num_levels[1],                              ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    LevelT              lower_level[1],                             ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-    LevelT              upper_level[1],                             ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceHistogram::HistogramEven(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram[0],
-            num_levels[0],
-            lower_level[0],
-            upper_level[0],
-            num_row_pixels,
-            num_rows,
-            row_stride_bytes,
-            stream,
-            debug_synchronous);
-    }
-    return error;
-}
-
-/**
- * Dispatch to CUB multi histogram-even entrypoint
- */
-template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t DispatchEven(
-    Int2Type<NUM_CHANNELS>          num_channels,
-    Int2Type<NUM_ACTIVE_CHANNELS>   num_active_channels,
-    Int2Type<CUB>           dispatch_to,
-    int                     timing_timing_iterations,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-    CounterT            *d_histogram[NUM_ACTIVE_CHANNELS],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-    int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-    LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceHistogram::MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram,
-            num_levels,
-            lower_level,
-            upper_level,
-            num_row_pixels,
-            num_rows,
-            row_stride_bytes,
-            stream,
-            debug_synchronous);
-    }
-    return error;
-}
-
-
-/**
- * Dispatch to CUB single histogram-range entrypoint
- */
-template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t DispatchRange(
-    Int2Type<1>             num_channels,
-    Int2Type<1>             num_active_channels,
-    Int2Type<CUB>           dispatch_to,
-    int                     timing_timing_iterations,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-    CounterT            *d_histogram[1],                            ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-    int                 num_levels[1],                              ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    LevelT              *d_levels[1],                               ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceHistogram::HistogramRange(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram[0],
-            num_levels[0],
-            d_levels[0],
-            num_row_pixels,
-            num_rows,
-            row_stride_bytes,
-            stream,
-            debug_synchronous);
-    }
-    return error;
-}
-
-
-/**
- * Dispatch to CUB multi histogram-range entrypoint
- */
-template <int NUM_ACTIVE_CHANNELS, int NUM_CHANNELS, typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t DispatchRange(
-    Int2Type<NUM_CHANNELS>          num_channels,
-    Int2Type<NUM_ACTIVE_CHANNELS>   num_active_channels,
-    Int2Type<CUB>           dispatch_to,
-    int                     timing_timing_iterations,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-    CounterT            *d_histogram[NUM_ACTIVE_CHANNELS],          ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-    int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-    OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-    OffsetT             row_stride_bytes,                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram,
-            num_levels,
-            d_levels,
-            num_row_pixels,
-            num_rows,
-            row_stride_bytes,
-            stream,
-            debug_synchronous);
-    }
-    return error;
-}
-
-
-
-//---------------------------------------------------------------------
-// CUDA nested-parallelism test kernel
-//---------------------------------------------------------------------
-
-/**
- * Simple wrapper kernel to invoke DeviceHistogram
- * /
-template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
-__global__ void CnpDispatchKernel(
-    Int2Type<ALGORITHM> algorithm,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t              temp_storage_bytes,
-    SampleT             *d_samples,
-    SampleIteratorT      d_sample_itr,
-    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_out_histograms,
-    int                 num_samples,
-    bool                debug_synchronous)
-{
-#ifndef CUB_CDP
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(algorithm, Int2Type<false>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_out_histograms.array, num_samples, 0, debug_synchronous);
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
-}
-
-
-/ **
- * Dispatch to CDP kernel
- * /
-template <int BINS, int NUM_CHANNELS, int NUM_ACTIVE_CHANNELS, typename SampleT, typename SampleIteratorT, typename CounterT, int ALGORITHM>
-cudaError_t Dispatch(
-    Int2Type<ALGORITHM> algorithm,
-    Int2Type<true>      use_cdp,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    SampleT             *d_samples,
-    SampleIteratorT      d_sample_itr,
-    CounterT        *d_histograms[NUM_ACTIVE_CHANNELS],
-    int                 num_samples,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
-    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_histo_wrapper;
-    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
-
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, ALGORITHM><<<1,1>>>(algorithm, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_samples, d_sample_itr, d_histo_wrapper, num_samples, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
-}
-*/
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-// Searches for bin given a list of bin-boundary levels
-template <typename LevelT>
-struct SearchTransform
-{
-    LevelT          *levels;      // Pointer to levels array
-    int             num_levels;   // Number of levels in array
-
-    // Functor for converting samples to bin-ids (num_levels is returned if sample is out of range)
-    template <typename SampleT>
-    int operator()(SampleT sample)
-    {
-        int bin = int(std::upper_bound(levels, levels + num_levels, (LevelT) sample) - levels - 1);
-        if (bin < 0)
-        {
-            // Sample out of range
-            return num_levels;
-        }
-        return bin;
-    }
-};
-
-
-// Scales samples to evenly-spaced bins
-template <typename LevelT>
-struct ScaleTransform
-{
-    int    num_levels;  // Number of levels in array
-    LevelT max;         // Max sample level (exclusive)
-    LevelT min;         // Min sample level (inclusive)
-    LevelT scale;       // Bin scaling factor
-
-    void Init(
-        int    num_levels,  // Number of levels in array
-        LevelT max,         // Max sample level (exclusive)
-        LevelT min,         // Min sample level (inclusive)
-        LevelT scale)       // Bin scaling factor
-    {
-        this->num_levels = num_levels;
-        this->max = max;
-        this->min = min;
-        this->scale = scale;
-    }
-
-    // Functor for converting samples to bin-ids  (num_levels is returned if sample is out of range)
-    template <typename SampleT>
-    int operator()(SampleT sample)
-    {
-        if ((sample < min) || (sample >= max))
-        {
-            // Sample out of range
-            return num_levels;
-        }
-
-        return (int) ((((LevelT) sample) - min) / scale);
-    }
-};
-
-// Scales samples to evenly-spaced bins
-template <>
-struct ScaleTransform<float>
-{
-    int   num_levels;  // Number of levels in array
-    float max;         // Max sample level (exclusive)
-    float min;         // Min sample level (inclusive)
-    float scale;       // Bin scaling factor
-
-    void Init(
-        int    num_levels,  // Number of levels in array
-        float max,         // Max sample level (exclusive)
-        float min,         // Min sample level (inclusive)
-        float scale)       // Bin scaling factor
-    {
-        this->num_levels = num_levels;
-        this->max = max;
-        this->min = min;
-        this->scale = 1.0f / scale;
-    }
-
-    // Functor for converting samples to bin-ids  (num_levels is returned if sample is out of range)
-    template <typename SampleT>
-    int operator()(SampleT sample)
-    {
-        if ((sample < min) || (sample >= max))
-        {
-            // Sample out of range
-            return num_levels;
-        }
-
-        return (int) ((((float) sample) - min) * scale);
-    }
-};
-
-
-/**
- * Generate sample
- */
-template <typename T, typename LevelT>
-void Sample(T &datum, LevelT max_level, int entropy_reduction)
-{
-    unsigned int max = (unsigned int) -1;
-    unsigned int bits;
-    RandomBits(bits, entropy_reduction);
-    float fraction = (float(bits) / max);
-
-    datum = (T) (fraction * max_level);
-}
-
-
-/**
- * Initialize histogram samples
- */
-template <
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        LevelT,
-    typename        SampleT,
-    typename        OffsetT>
-void InitializeSamples(
-    LevelT          max_level,
-    int             entropy_reduction,
-    SampleT         *h_samples,
-    OffsetT         num_row_pixels,         ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT         num_rows,               ///< [in] The number of rows in the region of interest
-    OffsetT         row_stride_bytes)       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-{
-    // Initialize samples
-    for (OffsetT row = 0; row < num_rows; ++row)
-    {
-        for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
-        {
-            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-            {
-                // Sample offset
-                OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
-
-                // Init sample value
-                Sample(h_samples[offset], max_level, entropy_reduction);
-                if (g_verbose_input)
-                {
-                    if (channel > 0) printf(", ");
-                    std::cout << CoutCast(h_samples[offset]);
-                }
-            }
-        }
-    }
-}
-
-
-/**
- * Initialize histogram solutions
- */
-template <
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        CounterT,
-    typename        SampleIteratorT,
-    typename        TransformOp,
-    typename        OffsetT>
-void InitializeBins(
-    SampleIteratorT h_samples,
-    int             num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    TransformOp     transform_op[NUM_ACTIVE_CHANNELS],      ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-    CounterT        *h_histogram[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-    OffsetT         num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT         num_rows,                               ///< [in] The number of rows in the region of interest
-    OffsetT         row_stride_bytes)                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-{
-    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-    // Init bins
-    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-    {
-        for (int bin = 0; bin < num_levels[CHANNEL] - 1; ++bin)
-        {
-            h_histogram[CHANNEL][bin] = 0;
-        }
-    }
-
-    // Initialize samples
-    if (g_verbose_input) printf("Samples: \n");
-    for (OffsetT row = 0; row < num_rows; ++row)
-    {
-        for (OffsetT pixel = 0; pixel < num_row_pixels; ++pixel)
-        {
-            if (g_verbose_input) printf("[");
-            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-            {
-                // Sample offset
-                OffsetT offset = (row * (row_stride_bytes / sizeof(SampleT))) + (pixel * NUM_CHANNELS) + channel;
-
-                // Update sample bin
-                int bin = transform_op[channel](h_samples[offset]);
-                if (g_verbose_input) printf(" (%d)", bin); fflush(stdout);
-                if ((bin >= 0) && (bin < num_levels[channel] - 1))
-                {
-                    // valid bin
-                    h_histogram[channel][bin]++;
-                }
-            }
-            if (g_verbose_input) printf("]");
-        }
-        if (g_verbose_input) printf("\n\n");
-    }
-}
-
-
-
-/**
- * Test histogram-even
- */
-template <
-    Backend         BACKEND,
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        SampleT,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT,
-    typename        SampleIteratorT>
-void TestEven(
-    LevelT          max_level,
-    int             entropy_reduction,
-    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
-    OffsetT         row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-    SampleIteratorT h_samples,
-    SampleIteratorT d_samples)
-{
-    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
-
-    printf("\n----------------------------\n");
-    printf("%s cub::DeviceHistogramEven (%s) %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
-        (IsPointer<SampleIteratorT>::VALUE) ? "pointer" : "iterator",
-        (int) (num_row_pixels * num_rows),
-        (int) num_rows,
-        (int) num_row_pixels,
-        (int) row_stride_bytes,
-        (int) total_samples,
-        (int) sizeof(SampleT),
-        typeid(SampleT).name(),
-        entropy_reduction,
-        typeid(CounterT).name(),
-        NUM_ACTIVE_CHANNELS,
-        NUM_CHANNELS);
-    std::cout << CoutCast(max_level) << "\n";
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-        std::cout << "\n\tChannel " << channel << ": " << num_levels[channel] - 1 << " bins [" << lower_level[channel] << ", " << upper_level[channel] << ")\n";
-    fflush(stdout);
-
-    // Allocate and initialize host and device data
-
-    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
-    CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
-    ScaleTransform<LevelT>      transform_op[NUM_ACTIVE_CHANNELS];
-
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        int bins = num_levels[channel] - 1;
-        h_histogram[channel] = new CounterT[bins];
-
-        transform_op[channel].Init(
-            num_levels[channel],
-            upper_level[channel],
-            lower_level[channel],
-            ((upper_level[channel] - lower_level[channel]) / bins));
-    }
-
-    InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
-        h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
-
-    // Allocate and initialize device data
-
-    CounterT* d_histogram[NUM_ACTIVE_CHANNELS];
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel], sizeof(CounterT) * (num_levels[channel] - 1)));
-        CubDebugExit(cudaMemset(d_histogram[channel], 0, sizeof(CounterT) * (num_levels[channel] - 1)));
-    }
-
-    // Allocate CDP device arrays
-    size_t          *d_temp_storage_bytes = NULL;
-    cudaError_t     *d_cdp_error = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-
-    DispatchEven(
-        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes,
-        d_samples, d_histogram, num_levels, lower_level, upper_level,
-        num_row_pixels, num_rows, row_stride_bytes,
-        0, true);
-
-    // Allocate temporary storage with "canary" zones
-    int     canary_bytes    = 256;
-    char    canary_token    = 8;
-    char*   canary_zone     = new char[canary_bytes];
-
-    memset(canary_zone, canary_token, canary_bytes);
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
-    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
-
-    // Run warmup/correctness iteration
-    DispatchEven(
-        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
-        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
-        d_samples, d_histogram, num_levels, lower_level, upper_level,
-        num_row_pixels, num_rows, row_stride_bytes,
-        0, true);
-
-    // Check canary zones
-    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
-    AssertEquals(0, error);
-    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
-    AssertEquals(0, error);
-
-    // Flush any stdout/stderr
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-    fflush(stdout);
-    fflush(stderr);
-
-    // Check for correctness (and display results, if specified)
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
-        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
-        error |= channel_error;
-    }
-
-    // Performance
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-
-    DispatchEven(
-        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes,
-        d_samples, d_histogram, num_levels, lower_level, upper_level,
-        num_row_pixels, num_rows, row_stride_bytes,
-        0, false);
-
-    gpu_timer.Stop();
-    float elapsed_millis = gpu_timer.ElapsedMillis();
-
-    // Display performance
-    if (g_timing_iterations > 0)
-    {
-        float avg_millis = elapsed_millis / g_timing_iterations;
-        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
-        float giga_bandwidth = giga_rate * sizeof(SampleT);
-        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
-            avg_millis,
-            giga_rate,
-            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
-            giga_rate / NUM_CHANNELS,
-            giga_bandwidth);
-    }
-
-    printf("\n\n");
-
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        if (h_histogram[channel])
-            delete[] h_histogram[channel];
-
-        if (d_histogram[channel])
-            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
-    }
-
-    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
-    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    // Correctness asserts
-    AssertEquals(0, error);
-}
-
-
-/**
- * Test histogram-even (native pointer input)
- */
-template <
-    Backend         BACKEND,
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        SampleT,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void TestEvenNative(
-    LevelT          max_level,
-    int             entropy_reduction,
-    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
-    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-{
-    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
-
-    // Allocate and initialize host sample data
-    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
-    SampleT*                    h_samples = new Foo[total_samples];
-
-    InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
-        max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
-
-    // Allocate and initialize device data
-    SampleT* d_samples = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
-    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
-
-    TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
-        max_level, entropy_reduction, num_levels, lower_level, upper_level,
-        num_row_pixels, num_rows, row_stride_bytes,
-        h_samples, d_samples);
-
-    // Cleanup
-    if (h_samples) delete[] h_samples;
-    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
-}
-
-
-/**
- * Test histogram-even (native pointer input)
- */
-template <
-    Backend         BACKEND,
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        SampleT,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void TestEvenIterator(
-    LevelT          max_level,
-    int             entropy_reduction,
-    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    LevelT          lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-    LevelT          upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
-    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-{
-    SampleT sample = (SampleT) lower_level[0];
-    ConstantInputIterator<SampleT> sample_itr(sample);
-
-    TestEven<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
-        max_level, entropy_reduction, num_levels, lower_level, upper_level,
-        num_row_pixels, num_rows, row_stride_bytes,
-        sample_itr, sample_itr);
-
-}
-
-
-/**
- * Test histogram-range
- */
-template <
-    Backend         BACKEND,
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        SampleT,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void TestRange(
-    LevelT          max_level,
-    int             entropy_reduction,
-    int             num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-    LevelT*         levels[NUM_ACTIVE_CHANNELS],                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-    OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-    OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
-    OffsetT         row_stride_bytes)                                 ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-{
-    OffsetT total_samples = num_rows * (row_stride_bytes / sizeof(SampleT));
-
-    printf("\n----------------------------\n");
-    printf("%s cub::DeviceHistogramRange %d pixels (%d height, %d width, %d-byte row stride), %d %d-byte %s samples (entropy reduction %d), %s counters, %d/%d channels, max sample ",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == NPP) ? "NPP" : "CUB",
-        (int) (num_row_pixels * num_rows),
-        (int) num_rows,
-        (int) num_row_pixels,
-        (int) row_stride_bytes,
-        (int) total_samples,
-        (int) sizeof(SampleT),
-        typeid(SampleT).name(),
-        entropy_reduction,
-        typeid(CounterT).name(),
-        NUM_ACTIVE_CHANNELS,
-        NUM_CHANNELS);
-    std::cout << CoutCast(max_level) << "\n";
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        printf("Channel %d: %d bins [", channel, num_levels[channel] - 1);
-        std::cout << levels[channel][0];
-        for (int level = 1; level < num_levels[channel]; ++level)
-            std::cout << ", " << levels[channel][level];
-        printf("]\n");
-    }
-    fflush(stdout);
-
-    // Allocate and initialize host and device data
-    typedef SampleT Foo;        // rename type to quelch gcc warnings (bug?)
-    SampleT*                    h_samples = new Foo[total_samples];
-    CounterT*                   h_histogram[NUM_ACTIVE_CHANNELS];
-    SearchTransform<LevelT>     transform_op[NUM_ACTIVE_CHANNELS];
-
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        transform_op[channel].levels = levels[channel];
-        transform_op[channel].num_levels = num_levels[channel];
-
-        int bins = num_levels[channel] - 1;
-        h_histogram[channel] = new CounterT[bins];
-    }
-
-    InitializeSamples<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
-        max_level, entropy_reduction, h_samples, num_row_pixels, num_rows, row_stride_bytes);
-
-    InitializeBins<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
-        h_samples, num_levels, transform_op, h_histogram, num_row_pixels, num_rows, row_stride_bytes);
-
-    // Allocate and initialize device data
-    SampleT*        d_samples = NULL;
-    LevelT*         d_levels[NUM_ACTIVE_CHANNELS];
-    CounterT*       d_histogram[NUM_ACTIVE_CHANNELS];
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_samples, sizeof(SampleT) * total_samples));
-    CubDebugExit(cudaMemcpy(d_samples, h_samples, sizeof(SampleT) * total_samples, cudaMemcpyHostToDevice));
-
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_levels[channel], sizeof(LevelT) * num_levels[channel]));
-        CubDebugExit(cudaMemcpy(d_levels[channel], levels[channel],         sizeof(LevelT) * num_levels[channel], cudaMemcpyHostToDevice));
-
-        int bins = num_levels[channel] - 1;
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_histogram[channel],  sizeof(CounterT) * bins));
-        CubDebugExit(cudaMemset(d_histogram[channel], 0,                        sizeof(CounterT) * bins));
-    }
-
-    // Allocate CDP device arrays
-    size_t          *d_temp_storage_bytes = NULL;
-    cudaError_t     *d_cdp_error = NULL;
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-
-    DispatchRange(
-        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes,
-        d_samples, d_histogram, num_levels, d_levels,
-        num_row_pixels, num_rows, row_stride_bytes,
-        0, true);
-
-    // Allocate temporary storage with "canary" zones
-    int     canary_bytes    = 256;
-    char    canary_token    = 9;
-    char*   canary_zone     = new char[canary_bytes];
-
-    memset(canary_zone, canary_token, canary_bytes);
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + (canary_bytes * 2)));
-    CubDebugExit(cudaMemset(d_temp_storage, canary_token, temp_storage_bytes + (canary_bytes * 2)));
-
-    // Run warmup/correctness iteration
-    DispatchRange(
-        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error,
-        ((char *) d_temp_storage) + canary_bytes, temp_storage_bytes,
-        d_samples, d_histogram, num_levels, d_levels,
-        num_row_pixels, num_rows, row_stride_bytes,
-        0, true);
-
-    // Check canary zones
-    int error = CompareDeviceResults(canary_zone, (char *) d_temp_storage, canary_bytes, true, g_verbose);
-    AssertEquals(0, error);
-    error = CompareDeviceResults(canary_zone, ((char *) d_temp_storage) + canary_bytes + temp_storage_bytes, canary_bytes, true, g_verbose);
-    AssertEquals(0, error);
-
-    // Flush any stdout/stderr
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-    fflush(stdout);
-    fflush(stderr);
-
-    // Check for correctness (and display results, if specified)
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        int channel_error = CompareDeviceResults(h_histogram[channel], d_histogram[channel], num_levels[channel] - 1, true, g_verbose);
-        printf("\tChannel %d %s", channel, channel_error ? "FAIL" : "PASS\n");
-        error |= channel_error;
-    }
-
-    // Performance
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-
-    DispatchRange(
-        Int2Type<NUM_CHANNELS>(), Int2Type<NUM_ACTIVE_CHANNELS>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes,
-        d_samples, d_histogram, num_levels, d_levels,
-        num_row_pixels, num_rows, row_stride_bytes,
-        0, false);
-
-    gpu_timer.Stop();
-    float elapsed_millis = gpu_timer.ElapsedMillis();
-
-    // Display performance
-    if (g_timing_iterations > 0)
-    {
-        float avg_millis = elapsed_millis / g_timing_iterations;
-        float giga_rate = float(total_samples) / avg_millis / 1000.0f / 1000.0f;
-        float giga_bandwidth = giga_rate * sizeof(SampleT);
-        printf("\t%.3f avg ms, %.3f billion samples/s, %.3f billion bins/s, %.3f billion pixels/s, %.3f logical GB/s",
-            avg_millis,
-            giga_rate,
-            giga_rate * NUM_ACTIVE_CHANNELS / NUM_CHANNELS,
-            giga_rate / NUM_CHANNELS,
-            giga_bandwidth);
-    }
-
-    printf("\n\n");
-
-    // Cleanup
-    if (h_samples) delete[] h_samples;
-
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        if (h_histogram[channel])
-            delete[] h_histogram[channel];
-
-        if (d_histogram[channel])
-            CubDebugExit(g_allocator.DeviceFree(d_histogram[channel]));
-
-        if (d_levels[channel])
-            CubDebugExit(g_allocator.DeviceFree(d_levels[channel]));
-    }
-
-    if (d_samples) CubDebugExit(g_allocator.DeviceFree(d_samples));
-    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
-    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    // Correctness asserts
-    AssertEquals(0, error);
-}
-
-
-/**
- * Test histogram-even
- */
-template <
-    Backend         BACKEND,
-    typename        SampleT,
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void TestEven(
-    OffsetT         num_row_pixels,
-    OffsetT         num_rows,
-    OffsetT         row_stride_bytes,
-    int             entropy_reduction,
-    int             num_levels[NUM_ACTIVE_CHANNELS],
-    LevelT          max_level,
-    int             max_num_levels)
-{
-    LevelT lower_level[NUM_ACTIVE_CHANNELS];
-    LevelT upper_level[NUM_ACTIVE_CHANNELS];
-
-    // Find smallest level increment
-    int max_bins = max_num_levels - 1;
-    LevelT min_level_increment = max_level / max_bins;
-
-    // Set upper and lower levels for each channel
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        int num_bins = num_levels[channel] - 1;
-        lower_level[channel] = (max_level - (num_bins * min_level_increment)) / 2;
-        upper_level[channel] = (max_level + (num_bins * min_level_increment)) / 2;
-    }
-
-    // Test pointer-based samples
-    TestEvenNative<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
-        max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
-
-    // Test iterator-based samples (CUB-only)
-    TestEvenIterator<CUB, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
-        max_level, entropy_reduction, num_levels, lower_level, upper_level, num_row_pixels, num_rows, row_stride_bytes);
-}
-
-
-
-/**
- * Test histogram-range
- */
-template <
-    Backend         BACKEND,
-    typename        SampleT,
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void TestRange(
-    OffsetT         num_row_pixels,
-    OffsetT         num_rows,
-    OffsetT         row_stride_bytes,
-    int             entropy_reduction,
-    int             num_levels[NUM_ACTIVE_CHANNELS],
-    LevelT          max_level,
-    int             max_num_levels)
-{
-    // Find smallest level increment
-    int max_bins = max_num_levels - 1;
-    LevelT min_level_increment = max_level / max_bins;
-
-    LevelT* levels[NUM_ACTIVE_CHANNELS];
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        levels[channel] = new LevelT[num_levels[channel]];
-
-        int num_bins = num_levels[channel] - 1;
-        LevelT lower_level = (max_level - (num_bins * min_level_increment)) / 2;
-
-        for (int level = 0; level < num_levels[channel]; ++level)
-            levels[channel][level] = lower_level + (level * min_level_increment);
-    }
-
-    TestRange<BACKEND, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleT, CounterT, LevelT, OffsetT>(
-        max_level, entropy_reduction, num_levels, levels, num_row_pixels, num_rows, row_stride_bytes);
-
-    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-        delete[] levels[channel];
-
-}
-
-
-
-/**
- * Test different entrypoints
- */
-template <
-    typename        SampleT,
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void Test(
-    OffsetT         num_row_pixels,
-    OffsetT         num_rows,
-    OffsetT         row_stride_bytes,
-    int             entropy_reduction,
-    int             num_levels[NUM_ACTIVE_CHANNELS],
-    LevelT          max_level,
-    int             max_num_levels)
-{
-    TestEven<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
-
-    TestRange<CUB, SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
-}
-
-
-/**
- * Test different number of levels
- */
-template <
-    typename        SampleT,
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void Test(
-    OffsetT         num_row_pixels,
-    OffsetT         num_rows,
-    OffsetT         row_stride_bytes,
-    int             entropy_reduction,
-    LevelT          max_level,
-    int             max_num_levels)
-{
-    int num_levels[NUM_ACTIVE_CHANNELS];
-
-// Unnecessary testing
-//    // All the same level
-//    for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-//    {
-//        num_levels[channel] = max_num_levels;
-//    }
-//    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-//        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
-
-    // All different levels
-    num_levels[0] = max_num_levels;
-    for (int channel = 1; channel < NUM_ACTIVE_CHANNELS; ++channel)
-    {
-        num_levels[channel] = (num_levels[channel - 1] / 2) + 1;
-    }
-    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-        num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, max_num_levels);
-}
-
-
-
-/**
- * Test different entropy-levels
- */
-template <
-    typename        SampleT,
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void Test(
-    OffsetT         num_row_pixels,
-    OffsetT         num_rows,
-    OffsetT         row_stride_bytes,
-    LevelT          max_level,
-    int             max_num_levels)
-{
-    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-        num_row_pixels, num_rows, row_stride_bytes, 0,   max_level, max_num_levels);
-
-    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-        num_row_pixels, num_rows, row_stride_bytes, -1,  max_level, max_num_levels);
-
-    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-        num_row_pixels, num_rows, row_stride_bytes, 5,   max_level, max_num_levels);
-}
-
-
-/**
- * Test different row strides
- */
-template <
-    typename        SampleT,
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void Test(
-    OffsetT         num_row_pixels,
-    OffsetT         num_rows,
-    LevelT          max_level,
-    int             max_num_levels)
-{
-    OffsetT row_stride_bytes = num_row_pixels * NUM_CHANNELS * sizeof(SampleT);
-
-    // No padding
-    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-        num_row_pixels, num_rows, row_stride_bytes, max_level, max_num_levels);
-
-    // 13 samples padding
-    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-        num_row_pixels, num_rows, row_stride_bytes + (13 * sizeof(SampleT)), max_level, max_num_levels);
-}
-
-
-/**
- * Test different problem sizes
- */
-template <
-    typename        SampleT,
-    int             NUM_CHANNELS,
-    int             NUM_ACTIVE_CHANNELS,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void Test(
-    LevelT          max_level,
-    int             max_num_levels)
-{
-    // 0 row/col images
-    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-        OffsetT(1920), OffsetT(0), max_level, max_num_levels);
-    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-        OffsetT(0), OffsetT(0), max_level, max_num_levels);
-
-    // 1080 image
-    Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-        OffsetT(1920), OffsetT(1080), max_level, max_num_levels);
-
-    // Sample different aspect ratios sizes
-    for (OffsetT rows = 1; rows < 1000000; rows *= 1000)
-    {
-        for (OffsetT cols = 1; cols < (1000000 / rows); cols *= 1000)
-        {
-            Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-                cols, rows, max_level, max_num_levels);
-        }
-    }
-
-    // Randomly select linear problem size between 1:10,000,000
-    unsigned int max_int = (unsigned int) -1;
-    for (int i = 0; i < 4; ++i)
-    {
-        unsigned int num_items;
-        RandomBits(num_items);
-        num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));
-        num_items = CUB_MAX(1, num_items);
-
-        Test<SampleT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, CounterT, LevelT, OffsetT>(
-            OffsetT(num_items), 1, max_level, max_num_levels);
-    }
-}
-
-
-
-/**
- * Test different channel interleavings (valid specialiation)
- */
-template <
-    typename        SampleT,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void TestChannels(
-    LevelT          max_level,
-    int             max_num_levels,
-    Int2Type<true>  is_valid_tag)
-{
-    Test<SampleT, 1, 1, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
-    Test<SampleT, 4, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
-    Test<SampleT, 3, 3, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
-    Test<SampleT, 4, 4, CounterT, LevelT, OffsetT>(max_level, max_num_levels);
-}
-
-
-/**
- * Test different channel interleavings (invalid specialiation)
- */
-template <
-    typename        SampleT,
-    typename        CounterT,
-    typename        LevelT,
-    typename        OffsetT>
-void TestChannels(
-    LevelT          max_level,
-    int             max_num_levels,
-    Int2Type<false> is_valid_tag)
-{}
-
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_row_pixels = -1;
-    int entropy_reduction = 0;
-    int num_rows = 1;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    g_verbose_input = args.CheckCmdLineFlag("v2");
-    args.GetCmdLineArgument("n", num_row_pixels);
-
-    int row_stride_pixels = num_row_pixels;
-
-    args.GetCmdLineArgument("rows", num_rows);
-    args.GetCmdLineArgument("stride", row_stride_pixels);
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    args.GetCmdLineArgument("repeat", g_repeat);
-    args.GetCmdLineArgument("entropy", entropy_reduction);
-#if defined(QUICK_TEST) || defined(QUICKER_TEST)
-    bool compare_npp = args.CheckCmdLineFlag("npp");
-#endif
-
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<pixels per row> "
-            "[--rows=<number of rows> "
-            "[--stride=<row stride in pixels> "
-            "[--i=<timing iterations> "
-            "[--device=<device-id>] "
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--entropy=<entropy-reduction factor (default 0)>]"
-            "[--v] "
-            "[--cdp]"
-            "[--npp]"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Get ptx version
-    int ptx_version;
-    CubDebugExit(PtxVersion(ptx_version));
-
-    if (num_row_pixels < 0)
-    {
-        num_row_pixels      = 1920 * 1080;
-        row_stride_pixels   = num_row_pixels;
-    }
-
-#if defined(QUICKER_TEST)
-
-    // Compile/run quick tests
-    {
-        // HistogramEven: unsigned char 256 bins
-        typedef unsigned char       SampleT;
-        typedef int                 LevelT;
-
-        LevelT  max_level           = 256;
-        int     num_levels[1]       = {257};
-        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
-
-        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-        if (compare_npp)
-            TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-    }
-
-
-#elif defined(QUICK_TEST)
-
-    // Compile/run quick tests
-    {
-        // HistogramEven: unsigned char 256 bins
-        typedef unsigned char       SampleT;
-        typedef int                 LevelT;
-
-        LevelT  max_level           = 256;
-        int     num_levels[1]       = {257};
-        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
-
-        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-        if (compare_npp)
-            TestEven<NPP, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-    }
-
-    {
-        // HistogramEven: 4/4 multichannel Unsigned char 256 bins
-        typedef unsigned char       SampleT;
-        typedef int                 LevelT;
-
-        LevelT  max_level           = 256;
-        int     num_levels[4]       = {257, 257, 257, 257};
-        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
-
-        TestEven<CUB, SampleT, 4, 4, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-    }
-
-    {
-        // HistogramEven: 3/4 multichannel Unsigned char 256 bins
-        typedef unsigned char       SampleT;
-        typedef int                 LevelT;
-
-        LevelT  max_level           = 256;
-        int     num_levels[3]       = {257, 257, 257};
-        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
-
-        TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-        if (compare_npp)
-            TestEven<NPP, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-    }
-
-    {
-        // HistogramEven: short [0,1024] 256 bins
-        typedef unsigned short      SampleT;
-        typedef unsigned short      LevelT;
-
-        LevelT  max_level           = 1024;
-        int     num_levels[1]       = {257};
-        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
-
-        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-    }
-
-    {
-        // HistogramEven: float [0,1.0] 256 bins
-        typedef float               SampleT;
-        typedef float               LevelT;
-
-        LevelT  max_level           = 1.0;
-        int     num_levels[1]       = {257};
-        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
-
-        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-    }
-
-    {
-        // HistogramEven: 3/4 multichannel float [0,1.0] 256 bins
-        typedef float               SampleT;
-        typedef float               LevelT;
-
-         LevelT  max_level           = 1.0;
-         int     num_levels[3]       = {257, 257, 257};
-         int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
-
-         TestEven<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-    }
-
-    {
-        // HistogramRange: signed char 256 bins
-        typedef signed char         SampleT;
-        typedef int                 LevelT;
-
-        LevelT  max_level           = 256;
-        int     num_levels[1]       = {257};
-        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
-
-        TestRange<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-    }
-
-    {
-        // HistogramRange: 3/4 channel, unsigned char, varied bins (256, 128, 64)
-        typedef unsigned char       SampleT;
-        typedef int                 LevelT;
-
-        LevelT  max_level           = 256;
-        int     num_levels[3]       = {257, 129, 65};
-        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 4;
-
-        TestRange<CUB, SampleT, 4, 3, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-    }
-
-    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
-    {
-        // HistogramEven: double [0,1.0] 64 bins
-        typedef double              SampleT;
-        typedef double              LevelT;
-
-        LevelT  max_level           = 1.0;
-        int     num_levels[1]       = {65};
-        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
-
-        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-    }
-
-    {
-        // HistogramEven: short [0,1024] 512 bins
-        typedef unsigned short      SampleT;
-        typedef unsigned short      LevelT;
-
-        LevelT  max_level           = 1024;
-        int     num_levels[1]       = {513};
-        int     row_stride_bytes    = sizeof(SampleT) * row_stride_pixels * 1;
-
-        TestEven<CUB, SampleT, 1, 1, int, LevelT, int>(num_row_pixels, num_rows, row_stride_bytes, entropy_reduction, num_levels, max_level, num_levels[0]);
-    }
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        TestChannels <unsigned char,    int, int,   int>(256,   256 + 1, Int2Type<true>());
-        TestChannels <signed char,      int, int,   int>(256,   256 + 1, Int2Type<true>());
-        TestChannels <unsigned short,   int, int,   int>(128,   128 + 1, Int2Type<true>());
-        TestChannels <unsigned short,   int, int,   int>(8192,  8192 + 1, Int2Type<true>());
-        TestChannels <float,            int, float, int>(1.0,   256 + 1, Int2Type<true>());
-
-		// Test down-conversion of size_t offsets to int
-        TestChannels <unsigned char,    int, int,   long long>(256, 256 + 1, Int2Type<(sizeof(size_t) != sizeof(int))>());
-    }
-
-#endif
-
-    return 0;
-}
-
diff --git a/external/cub/test/test_device_radix_sort.cu b/external/cub/test/test_device_radix_sort.cu
deleted file mode 100644
index e63ca4e4b45..00000000000
--- a/external/cub/test/test_device_radix_sort.cu
+++ /dev/null
@@ -1,1275 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of DeviceRadixSort utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <algorithm>
-#include <typeinfo>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_radix_sort.cuh>
-#include <cub/device/device_segmented_radix_sort.cuh>
-
-#include "test_util.h"
-
-#include <thrust/device_ptr.h>
-#include <thrust/sort.h>
-#include <thrust/reverse.h>
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose           = false;
-int                     g_timing_iterations = 0;
-int                     g_repeat            = 0;
-CachingDeviceAllocator  g_allocator(true);
-
-// Dispatch types
-enum Backend
-{
-    CUB,                        // CUB method (allows overwriting of input)
-    CUB_NO_OVERWRITE,           // CUB method (disallows overwriting of input)
-
-    CUB_SEGMENTED,              // CUB method (allows overwriting of input)
-    CUB_SEGMENTED_NO_OVERWRITE, // CUB method (disallows overwriting of input)
-
-    THRUST,                     // Thrust method
-    CDP,                        // GPU-based (dynamic parallelism) dispatch to CUB method
-};
-
-
-//---------------------------------------------------------------------
-// Dispatch to different DeviceRadixSort entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to CUB sorting entrypoint (specialized for ascending)
- */
-template <typename KeyT, typename ValueT>
-CUB_RUNTIME_FUNCTION
-__forceinline__
-cudaError_t Dispatch(
-    Int2Type<false>         is_descending,
-    Int2Type<CUB>           dispatch_to,
-    int                     *d_selector,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void*                   d_temp_storage,
-    size_t&                 temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<ValueT>    &d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
-{
-    return DeviceRadixSort::SortPairs(
-        d_temp_storage, temp_storage_bytes,
-        d_keys, d_values,
-        num_items, begin_bit, end_bit, stream, debug_synchronous);
-}
-
-/**
- * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for ascending)
- */
-template <typename KeyT, typename ValueT>
-CUB_RUNTIME_FUNCTION
-__forceinline__
-cudaError_t Dispatch(
-    Int2Type<false>             is_descending,
-    Int2Type<CUB_NO_OVERWRITE>  dispatch_to,
-    int                         *d_selector,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*                   d_temp_storage,
-    size_t&                 temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<ValueT>    &d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
-{
-    KeyT      const *const_keys_itr     = d_keys.Current();
-    ValueT    const *const_values_itr   = d_values.Current();
-
-    cudaError_t retval = DeviceRadixSort::SortPairs(
-        d_temp_storage, temp_storage_bytes,
-        const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
-        num_items, begin_bit, end_bit, stream, debug_synchronous);
-
-    d_keys.selector ^= 1;
-    d_values.selector ^= 1;
-    return retval;
-}
-
-/**
- * Dispatch to CUB sorting entrypoint (specialized for descending)
- */
-template <typename KeyT, typename ValueT>
-CUB_RUNTIME_FUNCTION
-__forceinline__
-cudaError_t Dispatch(
-    Int2Type<true>          is_descending,
-    Int2Type<CUB>           dispatch_to,
-    int                     *d_selector,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void*                   d_temp_storage,
-    size_t&                 temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<ValueT>    &d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
-{
-    return DeviceRadixSort::SortPairsDescending(
-        d_temp_storage, temp_storage_bytes,
-        d_keys, d_values,
-        num_items, begin_bit, end_bit, stream, debug_synchronous);
-}
-
-
-/**
- * Dispatch to CUB_NO_OVERWRITE sorting entrypoint (specialized for descending)
- */
-template <typename KeyT, typename ValueT>
-CUB_RUNTIME_FUNCTION
-__forceinline__
-cudaError_t Dispatch(
-    Int2Type<true>              is_descending,
-    Int2Type<CUB_NO_OVERWRITE>  dispatch_to,
-    int                         *d_selector,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*                   d_temp_storage,
-    size_t&                 temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<ValueT>    &d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
-{
-    KeyT      const *const_keys_itr     = d_keys.Current();
-    ValueT    const *const_values_itr   = d_values.Current();
-
-    cudaError_t retval = DeviceRadixSort::SortPairsDescending(
-        d_temp_storage, temp_storage_bytes,
-        const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
-        num_items, begin_bit, end_bit, stream, debug_synchronous);
-
-    d_keys.selector ^= 1;
-    d_values.selector ^= 1;
-    return retval;
-}
-
-//---------------------------------------------------------------------
-// Dispatch to different DeviceRadixSort entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for ascending)
- */
-template <typename KeyT, typename ValueT>
-CUB_RUNTIME_FUNCTION
-__forceinline__
-cudaError_t Dispatch(
-    Int2Type<false>         is_descending,
-    Int2Type<CUB_SEGMENTED> dispatch_to,
-    int                     *d_selector,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void*                   d_temp_storage,
-    size_t&                 temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<ValueT>    &d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
-{
-    return DeviceSegmentedRadixSort::SortPairs(
-        d_temp_storage, temp_storage_bytes,
-        d_keys, d_values,
-        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
-        begin_bit, end_bit, stream, debug_synchronous);
-}
-
-/**
- * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for ascending)
- */
-template <typename KeyT, typename ValueT>
-CUB_RUNTIME_FUNCTION
-__forceinline__
-cudaError_t Dispatch(
-    Int2Type<false>                         is_descending,
-    Int2Type<CUB_SEGMENTED_NO_OVERWRITE>    dispatch_to,
-    int                                     *d_selector,
-    size_t                                  *d_temp_storage_bytes,
-    cudaError_t                             *d_cdp_error,
-
-    void*                   d_temp_storage,
-    size_t&                 temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<ValueT>    &d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
-{
-    KeyT      const *const_keys_itr     = d_keys.Current();
-    ValueT    const *const_values_itr   = d_values.Current();
-
-    cudaError_t retval = DeviceSegmentedRadixSort::SortPairs(
-        d_temp_storage, temp_storage_bytes,
-        const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
-        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
-        begin_bit, end_bit, stream, debug_synchronous);
-
-    d_keys.selector ^= 1;
-    d_values.selector ^= 1;
-    return retval;
-}
-
-
-/**
- * Dispatch to CUB_SEGMENTED sorting entrypoint (specialized for descending)
- */
-template <typename KeyT, typename ValueT>
-CUB_RUNTIME_FUNCTION
-__forceinline__
-cudaError_t Dispatch(
-    Int2Type<true>          is_descending,
-    Int2Type<CUB_SEGMENTED> dispatch_to,
-    int                     *d_selector,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void*                   d_temp_storage,
-    size_t&                 temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<ValueT>    &d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
-{
-    return DeviceSegmentedRadixSort::SortPairsDescending(
-        d_temp_storage, temp_storage_bytes,
-        d_keys, d_values,
-        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
-        begin_bit, end_bit, stream, debug_synchronous);
-}
-
-/**
- * Dispatch to CUB_SEGMENTED_NO_OVERWRITE sorting entrypoint (specialized for descending)
- */
-template <typename KeyT, typename ValueT>
-CUB_RUNTIME_FUNCTION
-__forceinline__
-cudaError_t Dispatch(
-    Int2Type<true>                          is_descending,
-    Int2Type<CUB_SEGMENTED_NO_OVERWRITE>    dispatch_to,
-    int                                     *d_selector,
-    size_t                                  *d_temp_storage_bytes,
-    cudaError_t                             *d_cdp_error,
-
-    void*                   d_temp_storage,
-    size_t&                 temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<ValueT>    &d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
-{
-    KeyT      const *const_keys_itr     = d_keys.Current();
-    ValueT    const *const_values_itr   = d_values.Current();
-
-    cudaError_t retval = DeviceSegmentedRadixSort::SortPairsDescending(
-        d_temp_storage, temp_storage_bytes,
-        const_keys_itr, d_keys.Alternate(), const_values_itr, d_values.Alternate(),
-        num_items, num_segments, d_segment_offsets, d_segment_offsets + 1,
-        begin_bit, end_bit, stream, debug_synchronous);
-
-    d_keys.selector ^= 1;
-    d_values.selector ^= 1;
-    return retval;
-}
-
-
-//---------------------------------------------------------------------
-// Dispatch to different Thrust entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch keys-only to Thrust sorting entrypoint
- */
-template <int IS_DESCENDING, typename KeyT>
-cudaError_t Dispatch(
-    Int2Type<IS_DESCENDING> is_descending,
-    Int2Type<THRUST>        dispatch_to,
-    int                     *d_selector,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void                    *d_temp_storage,
-    size_t                  &temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<NullType>  &d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
-{
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<KeyT> d_keys_wrapper(d_keys.Current());
-
-        if (IS_DESCENDING) thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
-        thrust::sort(d_keys_wrapper, d_keys_wrapper + num_items);
-        if (IS_DESCENDING) thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
-    }
-
-    return cudaSuccess;
-}
-
-
-/**
- * Dispatch key-value pairs to Thrust sorting entrypoint
- */
-template <int IS_DESCENDING, typename KeyT, typename ValueT>
-cudaError_t Dispatch(
-    Int2Type<IS_DESCENDING> is_descending,
-    Int2Type<THRUST>        dispatch_to,
-    int                     *d_selector,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void                    *d_temp_storage,
-    size_t                  &temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<ValueT>    &d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
-{
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<KeyT>     d_keys_wrapper(d_keys.Current());
-        thrust::device_ptr<ValueT>   d_values_wrapper(d_values.Current());
-
-        if (IS_DESCENDING) {
-            thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
-            thrust::reverse(d_values_wrapper, d_values_wrapper + num_items);
-        }
-
-        thrust::sort_by_key(d_keys_wrapper, d_keys_wrapper + num_items, d_values_wrapper);
-
-        if (IS_DESCENDING) {
-            thrust::reverse(d_keys_wrapper, d_keys_wrapper + num_items);
-            thrust::reverse(d_values_wrapper, d_values_wrapper + num_items);
-        }
-    }
-
-    return cudaSuccess;
-}
-
-
-//---------------------------------------------------------------------
-// CUDA Nested Parallelism Test Kernel
-//---------------------------------------------------------------------
-
-/**
- * Simple wrapper kernel to invoke DeviceRadixSort
- */
-template <int IS_DESCENDING, typename KeyT, typename ValueT>
-__global__ void CnpDispatchKernel(
-    Int2Type<IS_DESCENDING> is_descending,
-    int                     *d_selector,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void                    *d_temp_storage,
-    size_t                  temp_storage_bytes,
-    DoubleBuffer<KeyT>      d_keys,
-    DoubleBuffer<ValueT>    d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    bool                    debug_synchronous)
-{
-#ifndef CUB_CDP
-    *d_cdp_error            = cudaErrorNotSupported;
-#else
-    *d_cdp_error            = Dispatch(
-                                is_descending, Int2Type<CUB>(), d_selector, d_temp_storage_bytes, d_cdp_error,
-                                d_temp_storage, temp_storage_bytes, d_keys, d_values,
-                                num_items, num_segments, d_segment_offsets,
-                                begin_bit, end_bit, 0, debug_synchronous);
-    *d_temp_storage_bytes   = temp_storage_bytes;
-    *d_selector             = d_keys.selector;
-#endif
-}
-
-
-/**
- * Dispatch to CDP kernel
- */
-template <int IS_DESCENDING, typename KeyT, typename ValueT>
-cudaError_t Dispatch(
-    Int2Type<IS_DESCENDING> is_descending,
-    Int2Type<CDP>           dispatch_to,
-    int                     *d_selector,
-    size_t                  *d_temp_storage_bytes,
-    cudaError_t             *d_cdp_error,
-
-    void                    *d_temp_storage,
-    size_t                  &temp_storage_bytes,
-    DoubleBuffer<KeyT>      &d_keys,
-    DoubleBuffer<ValueT>    &d_values,
-    int                     num_items,
-    int                     num_segments,
-    const int               *d_segment_offsets,
-    int                     begin_bit,
-    int                     end_bit,
-    cudaStream_t            stream,
-    bool                    debug_synchronous)
-{
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(
-        is_descending, d_selector, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_keys, d_values,
-        num_items, num_segments, d_segment_offsets,
-        begin_bit, end_bit, debug_synchronous);
-
-    // Copy out selector
-    CubDebugExit(cudaMemcpy(&d_keys.selector, d_selector, sizeof(int) * 1, cudaMemcpyDeviceToHost));
-    d_values.selector = d_keys.selector;
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
-}
-
-
-
-//---------------------------------------------------------------------
-// Problem generation
-//---------------------------------------------------------------------
-
-
-/**
- * Simple key-value pairing
- */
-template <
-    typename KeyT,
-    typename ValueT,
-    bool IS_FLOAT = (Traits<KeyT>::CATEGORY == FLOATING_POINT)>
-struct Pair
-{
-    KeyT     key;
-    ValueT   value;
-
-    bool operator<(const Pair &b) const
-    {
-        return (key < b.key);
-    }
-};
-
-
-/**
- * Simple key-value pairing (specialized for bool types)
- */
-template <typename ValueT>
-struct Pair<bool, ValueT, false>
-{
-    bool     key;
-    ValueT   value;
-
-    bool operator<(const Pair &b) const
-    {
-        return (!key && b.key);
-    }
-};
-
-
-/**
- * Simple key-value pairing (specialized for floating point types)
- */
-template <typename KeyT, typename ValueT>
-struct Pair<KeyT, ValueT, true>
-{
-    KeyT     key;
-    ValueT   value;
-
-    bool operator<(const Pair &b) const
-    {
-        if (key < b.key)
-            return true;
-
-        if (key > b.key)
-            return false;
-
-        // KeyT in unsigned bits
-        typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
-
-        // Return true if key is negative zero and b.key is positive zero
-        UnsignedBits key_bits   = *reinterpret_cast<UnsignedBits*>(const_cast<KeyT*>(&key));
-        UnsignedBits b_key_bits = *reinterpret_cast<UnsignedBits*>(const_cast<KeyT*>(&b.key));
-        UnsignedBits HIGH_BIT   = Traits<KeyT>::HIGH_BIT;
-
-        return ((key_bits & HIGH_BIT) != 0) && ((b_key_bits & HIGH_BIT) == 0);
-    }
-};
-
-
-/**
- * Initialize key data
- */
-template <typename KeyT>
-void InitializeKeyBits(
-    GenMode         gen_mode,
-    KeyT            *h_keys,
-    int             num_items,
-    int             entropy_reduction)
-{
-    for (int i = 0; i < num_items; ++i)
-        InitValue(gen_mode, h_keys[i], i);
-}
-
-
-/**
- * Initialize solution
- */
-template <bool IS_DESCENDING, typename KeyT>
-void InitializeSolution(
-    KeyT    *h_keys,
-    int     num_items,
-    int     num_segments,
-    int     *h_segment_offsets,
-    int     begin_bit,
-    int     end_bit,
-    int     *&h_reference_ranks,
-    KeyT    *&h_reference_keys)
-{
-    typedef Pair<KeyT, int> PairT;
-
-    PairT *h_pairs = new PairT[num_items];
-
-    int num_bits = end_bit - begin_bit;
-    for (int i = 0; i < num_items; ++i)
-    {
-
-        // Mask off unwanted portions
-        if (num_bits < sizeof(KeyT) * 8)
-        {
-            unsigned long long base = 0;
-            memcpy(&base, &h_keys[i], sizeof(KeyT));
-            base &= ((1ull << num_bits) - 1) << begin_bit;
-            memcpy(&h_pairs[i].key, &base, sizeof(KeyT));
-        }
-        else
-        {
-            h_pairs[i].key = h_keys[i];
-        }
-
-        h_pairs[i].value = i;
-    }
-
-    printf("\nSorting reference solution on CPU (%d segments)...", num_segments); fflush(stdout);
-
-    for (int i = 0; i < num_segments; ++i)
-    {
-        if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
-        std::stable_sort(               h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
-        if (IS_DESCENDING) std::reverse(h_pairs + h_segment_offsets[i], h_pairs + h_segment_offsets[i + 1]);
-    }
-
-    printf(" Done.\n"); fflush(stdout);
-
-    h_reference_ranks  = new int[num_items];
-    h_reference_keys   = new KeyT[num_items];
-
-    for (int i = 0; i < num_items; ++i)
-    {
-        h_reference_ranks[i]    = h_pairs[i].value;
-        h_reference_keys[i]     = h_keys[h_pairs[i].value];
-    }
-
-    if (h_pairs) delete[] h_pairs;
-}
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-
-/**
- * Test DeviceRadixSort
- */
-template <
-    Backend     BACKEND,
-    bool        IS_DESCENDING,
-    typename    KeyT,
-    typename    ValueT>
-void Test(
-    KeyT        *h_keys,
-    ValueT      *h_values,
-    int         num_items,
-    int         num_segments,
-    int         *h_segment_offsets,
-    int         begin_bit,
-    int         end_bit,
-    KeyT        *h_reference_keys,
-    ValueT      *h_reference_values)
-{
-    const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
-
-    printf("%s %s cub::DeviceRadixSort %d items, %d segments, %d-byte keys (%s) %d-byte values (%s), descending %d, begin_bit %d, end_bit %d\n",
-        (BACKEND == CUB_NO_OVERWRITE) ? "CUB_NO_OVERWRITE" : (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
-        (KEYS_ONLY) ? "keys-only" : "key-value",
-        num_items, num_segments,
-        (int) sizeof(KeyT), typeid(KeyT).name(), (KEYS_ONLY) ? 0 : (int) sizeof(ValueT), typeid(ValueT).name(),
-        IS_DESCENDING, begin_bit, end_bit);
-    fflush(stdout);
-
-    if (g_verbose)
-    {
-        printf("Input keys:\n");
-        DisplayResults(h_keys, num_items);
-        printf("\n\n");
-    }
-
-    // Allocate device arrays
-    DoubleBuffer<KeyT>   d_keys;
-    DoubleBuffer<ValueT> d_values;
-    int                 *d_selector;
-    int                 *d_segment_offsets;
-    size_t              *d_temp_storage_bytes;
-    cudaError_t         *d_cdp_error;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[0], sizeof(KeyT) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys.d_buffers[1], sizeof(KeyT) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_selector, sizeof(int) * 1));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(int) * (num_segments + 1)));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));
-    if (!KEYS_ONLY)
-    {
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[0], sizeof(ValueT) * num_items));
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values.d_buffers[1], sizeof(ValueT) * num_items));
-    }
-
-    // Allocate temporary storage (and make it un-aligned)
-    size_t  temp_storage_bytes  = 0;
-    void    *d_temp_storage     = NULL;
-    CubDebugExit(Dispatch(
-        Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_keys, d_values,
-        num_items, num_segments, d_segment_offsets,
-        begin_bit, end_bit, 0, true));
-
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes + 1));
-    void* mis_aligned_temp = static_cast<char*>(d_temp_storage) + 1;
-
-    // Initialize/clear device arrays
-    d_keys.selector = 0;
-    CubDebugExit(cudaMemcpy(d_keys.d_buffers[0], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemset(d_keys.d_buffers[1], 0, sizeof(KeyT) * num_items));
-    if (!KEYS_ONLY)
-    {
-        d_values.selector = 0;
-        CubDebugExit(cudaMemcpy(d_values.d_buffers[0], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
-        CubDebugExit(cudaMemset(d_values.d_buffers[1], 0, sizeof(ValueT) * num_items));
-    }
-    CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(int) * (num_segments + 1), cudaMemcpyHostToDevice));
-
-    // Run warmup/correctness iteration
-    CubDebugExit(Dispatch(
-        Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
-        mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
-        num_items, num_segments, d_segment_offsets,
-        begin_bit, end_bit, 0, true));
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Check for correctness (and display results, if specified)
-    printf("Warmup done.  Checking results:\n"); fflush(stdout);
-    int compare = CompareDeviceResults(h_reference_keys, d_keys.Current(), num_items, true, g_verbose);
-    printf("\t Compare keys (selector %d): %s ", d_keys.selector, compare ? "FAIL" : "PASS"); fflush(stdout);
-    if (!KEYS_ONLY)
-    {
-        int values_compare = CompareDeviceResults(h_reference_values, d_values.Current(), num_items, true, g_verbose);
-        compare |= values_compare;
-        printf("\t Compare values (selector %d): %s ", d_values.selector, values_compare ? "FAIL" : "PASS"); fflush(stdout);
-    }
-    if (BACKEND == CUB_NO_OVERWRITE)
-    {
-        // Check that input isn't overwritten
-        int input_compare = CompareDeviceResults(h_keys, d_keys.d_buffers[0], num_items, true, g_verbose);
-        compare |= input_compare;
-        printf("\t Compare input keys: %s ", input_compare ? "FAIL" : "PASS"); fflush(stdout);
-    }
-
-    // Performance
-    if (g_timing_iterations)
-        printf("\nPerforming timing iterations:\n"); fflush(stdout);
-
-    GpuTimer gpu_timer;
-    float elapsed_millis = 0.0f;
-    for (int i = 0; i < g_timing_iterations; ++i)
-    {
-        // Initialize/clear device arrays
-        CubDebugExit(cudaMemcpy(d_keys.d_buffers[d_keys.selector], h_keys, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
-        CubDebugExit(cudaMemset(d_keys.d_buffers[d_keys.selector ^ 1], 0, sizeof(KeyT) * num_items));
-        if (!KEYS_ONLY)
-        {
-            CubDebugExit(cudaMemcpy(d_values.d_buffers[d_values.selector], h_values, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
-            CubDebugExit(cudaMemset(d_values.d_buffers[d_values.selector ^ 1], 0, sizeof(ValueT) * num_items));
-        }
-
-        gpu_timer.Start();
-        CubDebugExit(Dispatch(
-            Int2Type<IS_DESCENDING>(), Int2Type<BACKEND>(), d_selector, d_temp_storage_bytes, d_cdp_error,
-            mis_aligned_temp, temp_storage_bytes, d_keys, d_values,
-            num_items, num_segments, d_segment_offsets,
-            begin_bit, end_bit, 0, false));
-        gpu_timer.Stop();
-        elapsed_millis += gpu_timer.ElapsedMillis();
-    }
-
-    // Display performance
-    if (g_timing_iterations > 0)
-    {
-        float avg_millis = elapsed_millis / g_timing_iterations;
-        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
-        float giga_bandwidth = (KEYS_ONLY) ?
-            giga_rate * sizeof(KeyT) * 2 :
-            giga_rate * (sizeof(KeyT) + sizeof(ValueT)) * 2;
-        printf("\n%.3f elapsed ms, %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", elapsed_millis, avg_millis, giga_rate, giga_bandwidth);
-    }
-
-    printf("\n\n");
-
-    // Cleanup
-    if (d_keys.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[0]));
-    if (d_keys.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_keys.d_buffers[1]));
-    if (d_values.d_buffers[0]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[0]));
-    if (d_values.d_buffers[1]) CubDebugExit(g_allocator.DeviceFree(d_values.d_buffers[1]));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
-    if (d_selector) CubDebugExit(g_allocator.DeviceFree(d_selector));
-    if (d_segment_offsets) CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
-    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
-
-    // Correctness asserts
-    AssertEquals(0, compare);
-}
-
-
-/**
- * Test backend
- */
-template <bool IS_DESCENDING, typename KeyT, typename ValueT>
-void TestBackend(
-    KeyT    *h_keys,
-    int     num_items,
-    int     num_segments,
-    int     *h_segment_offsets,
-    int     begin_bit,
-    int     end_bit,
-    KeyT    *h_reference_keys,
-    int     *h_reference_ranks)
-{
-    const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
-
-    ValueT *h_values             = NULL;
-    ValueT *h_reference_values   = NULL;
-
-    if (!KEYS_ONLY)
-    {
-        h_values            = new ValueT[num_items];
-        h_reference_values  = new ValueT[num_items];
-
-        for (int i = 0; i < num_items; ++i)
-        {
-            InitValue(INTEGER_SEED, h_values[i], i);
-            InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
-        }
-    }
-
-    if (num_segments == 1)
-    {
-        // Test single-segment implementations
-        Test<CUB, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
-        Test<CUB_NO_OVERWRITE, IS_DESCENDING>(  h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
-#ifdef CUB_CDP
-        Test<CDP, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
-#endif
-    }
-
-    // Test multi-segment implementations
-    Test<CUB_SEGMENTED, IS_DESCENDING>(               h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
-    Test<CUB_SEGMENTED_NO_OVERWRITE, IS_DESCENDING>(  h_keys, h_values, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_values);
-
-    if (h_values) delete[] h_values;
-    if (h_reference_values) delete[] h_reference_values;
-}
-
-
-
-
-/**
- * Test value type
- */
-template <bool IS_DESCENDING, typename KeyT>
-void TestValueTypes(
-    KeyT    *h_keys,
-    int     num_items,
-    int     num_segments,
-    int     *h_segment_offsets,
-    int     begin_bit,
-    int     end_bit)
-{
-    // Initialize the solution
-
-    int *h_reference_ranks = NULL;
-    KeyT *h_reference_keys = NULL;
-    InitializeSolution<IS_DESCENDING>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_ranks, h_reference_keys);
-
-    // Test keys-only
-    TestBackend<IS_DESCENDING, KeyT, NullType>          (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
-
-    // Test with 8b value
-    TestBackend<IS_DESCENDING, KeyT, unsigned char>     (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
-
-    // Test with 32b value
-    TestBackend<IS_DESCENDING, KeyT, unsigned int>      (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
-
-    // Test with 64b value
-    TestBackend<IS_DESCENDING, KeyT, unsigned long long>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
-
-    // Test with non-trivially-constructable value
-    TestBackend<IS_DESCENDING, KeyT, TestBar>           (h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit, h_reference_keys, h_reference_ranks);
-
-    // Cleanup
-    if (h_reference_ranks) delete[] h_reference_ranks;
-    if (h_reference_keys) delete[] h_reference_keys;
-}
-
-
-
-/**
- * Test ascending/descending
- */
-template <typename KeyT>
-void TestDirection(
-    KeyT    *h_keys,
-    int     num_items,
-    int     num_segments,
-    int     *h_segment_offsets,
-    int     begin_bit,
-    int     end_bit)
-{
-    TestValueTypes<true>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
-    TestValueTypes<false>(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
-}
-
-
-/**
- * Test different bit ranges
- */
-template <typename KeyT>
-void TestBits(
-    KeyT    *h_keys,
-    int     num_items,
-    int     num_segments,
-    int     *h_segment_offsets)
-{
-    // Don't test partial-word sorting for boolean, fp, or signed types (the bit-flipping techniques get in the way)
-    if ((Traits<KeyT>::CATEGORY == UNSIGNED_INTEGER) && (!Equals<KeyT, bool>::VALUE))
-    {
-        // Partial bits
-        int begin_bit = 1;
-        int end_bit = (sizeof(KeyT) * 8) - 1;
-        printf("Testing key bits [%d,%d)\n", begin_bit, end_bit); fflush(stdout);
-        TestDirection(h_keys, num_items, num_segments, h_segment_offsets, begin_bit, end_bit);
-
-        // Across subword boundaries
-        int mid_bit = sizeof(KeyT) * 4;
-        printf("Testing key bits [%d,%d)\n", mid_bit - 1, mid_bit + 1); fflush(stdout);
-        TestDirection(h_keys, num_items, num_segments, h_segment_offsets, mid_bit - 1, mid_bit + 1);
-    }
-
-    printf("Testing key bits [%d,%d)\n", 0, int(sizeof(KeyT)) * 8); fflush(stdout);
-    TestDirection(h_keys, num_items, num_segments, h_segment_offsets, 0, sizeof(KeyT) * 8);
-}
-
-
-/**
- * Test different segment compositions
- */
-template <typename KeyT>
-void TestSegments(
-    KeyT    *h_keys,
-    int     num_items,
-    int     max_segments)
-{
-    int *h_segment_offsets = new int[max_segments + 1];
-
-    for (int num_segments = max_segments; num_segments > 1; num_segments = (num_segments + 32 - 1) / 32)
-    {
-        if (num_items / num_segments < 128 * 1000) {
-            // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
-            InitializeSegments(num_items, num_segments, h_segment_offsets);
-            TestBits(h_keys, num_items, num_segments, h_segment_offsets);
-        }
-    }
-
-    // Test single segment
-    if (num_items < 128 * 1000) {
-        // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
-        InitializeSegments(num_items, 1, h_segment_offsets);
-        TestBits(h_keys, num_items, 1, h_segment_offsets);
-    }
-
-    if (h_segment_offsets) delete[] h_segment_offsets;
-}
-
-
-/**
- * Test different (sub)lengths and number of segments
- */
-template <typename KeyT>
-void TestSizes(
-    KeyT    *h_keys,
-    int     max_items,
-    int     max_segments)
-{
-    for (int num_items = max_items; num_items > 1; num_items = (num_items + 32 - 1) / 32)
-    {
-        TestSegments(h_keys, num_items, max_segments);
-    }
-    TestSegments(h_keys, 1, max_segments);
-    TestSegments(h_keys, 0, max_segments);
-}
-
-
-/**
- * Test key sampling distributions
- */
-template <typename KeyT>
-void TestGen(
-    int             max_items,
-    int             max_segments)
-{
-    int ptx_version;
-    CubDebugExit(PtxVersion(ptx_version));
-
-    if (max_items < 0)
-        max_items = (ptx_version > 100) ? 9000003 : max_items = 5000003;
-
-    if (max_segments < 0)
-        max_segments = 5003;
-
-    KeyT *h_keys = new KeyT[max_items];
-
-    for (int entropy_reduction = 0; entropy_reduction <= 6; entropy_reduction += 3)
-    {
-        printf("\nTesting random %s keys with entropy reduction factor %d\n", typeid(KeyT).name(), entropy_reduction); fflush(stdout);
-        InitializeKeyBits(RANDOM, h_keys, max_items, entropy_reduction);
-        TestSizes(h_keys, max_items, max_segments);
-    }
-
-    printf("\nTesting uniform %s keys\n", typeid(KeyT).name()); fflush(stdout);
-    InitializeKeyBits(UNIFORM, h_keys, max_items, 0);
-    TestSizes(h_keys, max_items, max_segments);
-
-    printf("\nTesting natural number %s keys\n", typeid(KeyT).name()); fflush(stdout);
-    InitializeKeyBits(INTEGER_SEED, h_keys, max_items, 0);
-    TestSizes(h_keys, max_items, max_segments);
-
-    if (h_keys) delete[] h_keys;
-}
-
-
-//---------------------------------------------------------------------
-// Simple test
-//---------------------------------------------------------------------
-
-template <
-    Backend     BACKEND,
-    typename    KeyT,
-    typename    ValueT,
-    bool        IS_DESCENDING>
-void Test(
-    int         num_items,
-    int         num_segments,
-    GenMode     gen_mode,
-    int         entropy_reduction,
-    int         begin_bit,
-    int         end_bit)
-{
-    const bool KEYS_ONLY = Equals<ValueT, NullType>::VALUE;
-
-    KeyT    *h_keys             = new KeyT[num_items];
-    int     *h_reference_ranks  = NULL;
-    KeyT    *h_reference_keys   = NULL;
-    ValueT  *h_values           = NULL;
-    ValueT  *h_reference_values = NULL;
-    int     *h_segment_offsets  = new int[num_segments + 1];
-
-    if (end_bit < 0)
-        end_bit = sizeof(KeyT) * 8;
-
-    InitializeKeyBits(gen_mode, h_keys, num_items, entropy_reduction);
-    InitializeSegments(num_items, num_segments, h_segment_offsets);
-    InitializeSolution<IS_DESCENDING>(
-        h_keys, num_items, num_segments, h_segment_offsets,
-        begin_bit, end_bit, h_reference_ranks, h_reference_keys);
-
-    if (!KEYS_ONLY)
-    {
-        h_values            = new ValueT[num_items];
-        h_reference_values  = new ValueT[num_items];
-
-        for (int i = 0; i < num_items; ++i)
-        {
-            InitValue(INTEGER_SEED, h_values[i], i);
-            InitValue(INTEGER_SEED, h_reference_values[i], h_reference_ranks[i]);
-        }
-    }
-    if (h_reference_ranks) delete[] h_reference_ranks;
-
-    printf("\nTesting bits [%d,%d) of %s keys with gen-mode %d\n", begin_bit, end_bit, typeid(KeyT).name(), gen_mode); fflush(stdout);
-    Test<BACKEND, IS_DESCENDING>(
-        h_keys, h_values,
-        num_items, num_segments, h_segment_offsets,
-        begin_bit, end_bit, h_reference_keys, h_reference_values);
-
-    if (h_keys)             delete[] h_keys;
-    if (h_reference_keys)   delete[] h_reference_keys;
-    if (h_values)           delete[] h_values;
-    if (h_reference_values) delete[] h_reference_values;
-    if (h_segment_offsets)  delete[] h_segment_offsets;
-}
-
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int bits = -1;
-    int num_items = -1;
-    int num_segments = -1;
-    int entropy_reduction = 0;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("s", num_segments);
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    args.GetCmdLineArgument("repeat", g_repeat);
-    args.GetCmdLineArgument("bits", bits);
-    args.GetCmdLineArgument("entropy", entropy_reduction);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--bits=<valid key bits>]"
-            "[--n=<input items> "
-            "[--s=<num segments> "
-            "[--i=<timing iterations> "
-            "[--device=<device-id>] "
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--v] "
-            "[--entropy=<entropy-reduction factor (default 0)>]"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Get ptx version
-    int ptx_version;
-    CubDebugExit(PtxVersion(ptx_version));
-
-#ifdef QUICKER_TEST
-
-    enum {
-        IS_DESCENDING   = false
-    };
-
-    // Compile/run basic CUB test
-    if (num_items < 0)      num_items       = 48000000;
-    if (num_segments < 0)   num_segments    = 5000;
-
-
-    Test<CUB_SEGMENTED, unsigned int,       NullType, IS_DESCENDING>(       num_items, num_segments,    RANDOM, entropy_reduction, 0, bits);
-
-    Test<CUB,           unsigned int,       NullType, IS_DESCENDING>(       num_items, 1,               RANDOM, entropy_reduction, 0, bits);
-    Test<CUB,           unsigned long long, NullType, IS_DESCENDING>(       num_items, 1,               RANDOM, entropy_reduction, 0, bits);
-
-    Test<CUB,           unsigned int,       unsigned int, IS_DESCENDING>(   num_items, 1,               RANDOM, entropy_reduction, 0, bits);
-    Test<CUB,           unsigned long long, unsigned int, IS_DESCENDING>(   num_items, 1,               RANDOM, entropy_reduction, 0, bits);
-
-#elif defined(QUICK_TEST)
-
-    // Compile/run quick tests
-    if (num_items < 0)      num_items       = 48000000;
-    if (num_segments < 0)   num_segments    = 5000;
-
-    // Compare CUB and thrust on 32b keys-only
-    Test<CUB, unsigned int, NullType, false> (                      num_items, 1, RANDOM, entropy_reduction, 0, bits);
-    Test<THRUST, unsigned int, NullType, false> (                   num_items, 1, RANDOM, entropy_reduction, 0, bits);
-
-    // Compare CUB and thrust on 64b keys-only
-    Test<CUB, unsigned long long, NullType, false> (                num_items, 1, RANDOM, entropy_reduction, 0, bits);
-    Test<THRUST, unsigned long long, NullType, false> (             num_items, 1, RANDOM, entropy_reduction, 0, bits);
-
-
-    // Compare CUB and thrust on 32b key-value pairs
-    Test<CUB, unsigned int, unsigned int, false> (                  num_items, 1, RANDOM, entropy_reduction, 0, bits);
-    Test<THRUST, unsigned int, unsigned int, false> (               num_items, 1, RANDOM, entropy_reduction, 0, bits);
-
-    // Compare CUB and thrust on 64b key-value pairs
-    Test<CUB, unsigned long long, unsigned long long, false> (      num_items, 1, RANDOM, entropy_reduction, 0, bits);
-    Test<THRUST, unsigned long long, unsigned long long, false> (   num_items, 1, RANDOM, entropy_reduction, 0, bits);
-
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        TestGen<bool>                 (num_items, num_segments);
-
-        TestGen<char>                 (num_items, num_segments);
-        TestGen<signed char>          (num_items, num_segments);
-        TestGen<unsigned char>        (num_items, num_segments);
-
-        TestGen<short>                (num_items, num_segments);
-        TestGen<unsigned short>       (num_items, num_segments);
-
-        TestGen<int>                  (num_items, num_segments);
-        TestGen<unsigned int>         (num_items, num_segments);
-
-        TestGen<long>                 (num_items, num_segments);
-        TestGen<unsigned long>        (num_items, num_segments);
-
-        TestGen<long long>            (num_items, num_segments);
-        TestGen<unsigned long long>   (num_items, num_segments);
-
-        TestGen<float>                (num_items, num_segments);
-
-        if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
-            TestGen<double>           (num_items, num_segments);
-
-    }
-
-#endif
-
-    return 0;
-}
-
diff --git a/external/cub/test/test_device_reduce.cu b/external/cub/test/test_device_reduce.cu
deleted file mode 100644
index 26c663dab4c..00000000000
--- a/external/cub/test/test_device_reduce.cu
+++ /dev/null
@@ -1,1339 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of DeviceReduce utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <limits>
-#include <typeinfo>
-
-#include <thrust/device_ptr.h>
-#include <thrust/reduce.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_reduce.cuh>
-#include <cub/device/device_segmented_reduce.cuh>
-#include <cub/iterator/constant_input_iterator.cuh>
-#include <cub/iterator/discard_output_iterator.cuh>
-#include <cub/iterator/transform_input_iterator.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-int                     g_ptx_version;
-int                     g_sm_count;
-bool                    g_verbose           = false;
-bool                    g_verbose_input     = false;
-int                     g_timing_iterations = 0;
-int                     g_repeat            = 0;
-CachingDeviceAllocator  g_allocator(true);
-
-
-// Dispatch types
-enum Backend
-{
-    CUB,            // CUB method
-    CUB_SEGMENTED,  // CUB segmented method
-    CUB_CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
-    THRUST,         // Thrust method
-};
-
-
-// Custom max functor
-struct CustomMax
-{
-    /// Boolean max operator, returns <tt>(a > b) ? a : b</tt>
-    template <typename OutputT>
-    __host__ __device__ __forceinline__ OutputT operator()(const OutputT &a, const OutputT &b)
-    {
-        return CUB_MAX(a, b);
-    }
-};
-
-
-//---------------------------------------------------------------------
-// Dispatch to different CUB DeviceReduce entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to reduce entrypoint (custom-max)
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    ReductionOpT        reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // Max-identity
-    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
-
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
-            d_in, d_out, num_items, reduction_op, identity,
-            stream, debug_synchronous);
-    }
-    return error;
-}
-
-/**
- * Dispatch to sum entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    cub::Sum            reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-/**
- * Dispatch to min entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    cub::Min            reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-/**
- * Dispatch to max entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    cub::Max            reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-/**
- * Dispatch to argmin entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    cub::ArgMin         reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-/**
- * Dispatch to argmax entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    cub::ArgMax         reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-
-//---------------------------------------------------------------------
-// Dispatch to different CUB DeviceSegmentedReduce entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to reduce entrypoint (custom-max)
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB_SEGMENTED>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    ReductionOpT        reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // Max-identity
-    OutputT identity = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
-
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes,
-            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1, reduction_op, identity,
-            stream, debug_synchronous);
-    }
-    return error;
-}
-
-/**
- * Dispatch to sum entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB_SEGMENTED>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    cub::Sum            reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
-            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
-            stream, debug_synchronous);
-    }
-    return error;
-}
-
-/**
- * Dispatch to min entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB_SEGMENTED>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    cub::Min            reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes,
-            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
-            stream, debug_synchronous);
-    }
-    return error;
-}
-
-/**
- * Dispatch to max entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB_SEGMENTED>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    cub::Max            reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes,
-            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
-            stream, debug_synchronous);
-    }
-    return error;
-}
-
-/**
- * Dispatch to argmin entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB_SEGMENTED>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    cub::ArgMin         reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
-            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
-            stream, debug_synchronous);
-    }
-    return error;
-}
-
-/**
- * Dispatch to argmax entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB_SEGMENTED>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    cub::ArgMax         reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to device reduction directly
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
-            d_in, d_out, max_segments, d_segment_offsets, d_segment_offsets + 1,
-            stream, debug_synchronous);
-    }
-    return error;
-}
-
-
-//---------------------------------------------------------------------
-// Dispatch to different Thrust entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to reduction entrypoint (min or max specialization)
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
-cudaError_t Dispatch(
-    Int2Type<THRUST>    dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    ReductionOpT         reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        OutputT init;
-        CubDebugExit(cudaMemcpy(&init, d_in + 0, sizeof(OutputT), cudaMemcpyDeviceToHost));
-
-        thrust::device_ptr<OutputT> d_in_wrapper(d_in);
-        OutputT retval;
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items, init, reduction_op);
-        }
-
-        if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
-            CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
-    }
-
-    return cudaSuccess;
-}
-
-/**
- * Dispatch to reduction entrypoint (sum specialization)
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT>
-cudaError_t Dispatch(
-    Int2Type<THRUST>    dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    Sum                 reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<OutputT> d_in_wrapper(d_in);
-        OutputT retval;
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            retval = thrust::reduce(d_in_wrapper, d_in_wrapper + num_items);
-        }
-
-        if (!Equals<OutputIteratorT, DiscardOutputIterator<int> >::VALUE)
-            CubDebugExit(cudaMemcpy(d_out, &retval, sizeof(OutputT), cudaMemcpyHostToDevice));
-    }
-
-    return cudaSuccess;
-}
-
-
-//---------------------------------------------------------------------
-// CUDA nested-parallelism test kernel
-//---------------------------------------------------------------------
-
-/**
- * Simple wrapper kernel to invoke DeviceReduce
- */
-template <
-    typename            InputIteratorT,
-    typename            OutputIteratorT,
-    typename            OffsetIteratorT,
-    typename            ReductionOpT>
-__global__ void CnpDispatchKernel(
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t              temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    ReductionOpT        reduction_op,
-    bool                debug_synchronous)
-{
-#ifndef CUB_CDP
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
-        d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, 0, debug_synchronous);
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
-}
-
-
-/**
- * Dispatch to CUB_CDP kernel
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetIteratorT, typename ReductionOpT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB_CDP>       dispatch_to,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    int                 num_items,
-    int                 max_segments,
-    OffsetIteratorT     d_segment_offsets,
-    ReductionOpT        reduction_op,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
-        d_in, d_out, num_items, max_segments, d_segment_offsets, reduction_op, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
-}
-
-
-
-//---------------------------------------------------------------------
-// Problem generation
-//---------------------------------------------------------------------
-
-/// Initialize problem
-template <typename InputT>
-void Initialize(
-    GenMode         gen_mode,
-    InputT          *h_in,
-    int             num_items)
-{
-    for (int i = 0; i < num_items; ++i)
-    {
-        InitValue(gen_mode, h_in[i], i);
-    }
-
-    if (g_verbose_input)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/// Solve problem (max/custom-max functor)
-template <typename ReductionOpT, typename InputT, typename _OutputT>
-struct Solution
-{
-    typedef _OutputT OutputT;
-
-    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
-    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
-        ReductionOpT reduction_op)
-    {
-        for (int i = 0; i < num_segments; ++i)
-        {
-            OutputT aggregate = Traits<InputT>::Lowest(); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
-            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
-                aggregate = reduction_op(aggregate, OutputT(h_in[j]));
-            h_reference[i] = aggregate;
-        }
-    }
-};
-
-/// Solve problem (min functor)
-template <typename InputT, typename _OutputT>
-struct Solution<cub::Min, InputT, _OutputT>
-{
-    typedef _OutputT OutputT;
-
-    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
-    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
-        cub::Min reduction_op)
-    {
-        for (int i = 0; i < num_segments; ++i)
-        {
-            OutputT aggregate = Traits<InputT>::Max();    // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
-            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
-                aggregate = reduction_op(aggregate, OutputT(h_in[j]));
-            h_reference[i] = aggregate;
-        }
-    }
-};
-
-
-/// Solve problem (sum functor)
-template <typename InputT, typename _OutputT>
-struct Solution<cub::Sum, InputT, _OutputT>
-{
-    typedef _OutputT OutputT;
-
-    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
-    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
-        cub::Sum reduction_op)
-    {
-        for (int i = 0; i < num_segments; ++i)
-        {
-            OutputT aggregate;
-            InitValue(INTEGER_SEED, aggregate, 0);
-            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
-                aggregate = reduction_op(aggregate, OutputT(h_in[j]));
-            h_reference[i] = aggregate;
-        }
-    }
-};
-
-/// Solve problem (argmin functor)
-template <typename InputValueT, typename OutputValueT>
-struct Solution<cub::ArgMin, InputValueT, OutputValueT>
-{
-    typedef KeyValuePair<int, OutputValueT> OutputT;
-
-    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
-    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
-        cub::ArgMin reduction_op)
-    {
-        for (int i = 0; i < num_segments; ++i)
-        {
-            OutputT aggregate(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<OutputT>::max() when C++ support is more prevalent
-            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
-            {
-                OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
-                aggregate = reduction_op(aggregate, item);
-            }
-            h_reference[i] = aggregate;
-        }
-    }
-};
-
-
-/// Solve problem (argmax functor)
-template <typename InputValueT, typename OutputValueT>
-struct Solution<cub::ArgMax, InputValueT, OutputValueT>
-{
-    typedef KeyValuePair<int, OutputValueT> OutputT;
-
-    template <typename HostInputIteratorT, typename OffsetT, typename OffsetIteratorT>
-    static void Solve(HostInputIteratorT h_in, OutputT *h_reference, OffsetT num_segments, OffsetIteratorT h_segment_offsets,
-        cub::ArgMax reduction_op)
-    {
-        for (int i = 0; i < num_segments; ++i)
-        {
-            OutputT aggregate(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<OutputT>::lowest() when C++ support is more prevalent
-            for (int j = h_segment_offsets[i]; j < h_segment_offsets[i + 1]; ++j)
-            {
-                OutputT item(j - h_segment_offsets[i], OutputValueT(h_in[j]));
-                aggregate = reduction_op(aggregate, item);
-            }
-            h_reference[i] = aggregate;
-        }
-    }
-};
-
-
-//---------------------------------------------------------------------
-// Problem generation
-//---------------------------------------------------------------------
-
-/// Test DeviceReduce for a given problem input
-template <
-    typename                BackendT,
-    typename                DeviceInputIteratorT,
-    typename                HostReferenceIteratorT,
-    typename                OffsetT,
-    typename                OffsetIteratorT,
-    typename                ReductionOpT>
-void Test(
-    BackendT                backend,
-    DeviceInputIteratorT    d_in,
-    OffsetT                 num_items,
-    OffsetT                 num_segments,
-    OffsetIteratorT         d_segment_offsets,
-    ReductionOpT            reduction_op,
-    HostReferenceIteratorT  h_reference)
-{
-    // Input and output data types
-    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type     InputT;
-    typedef typename std::iterator_traits<HostReferenceIteratorT>::value_type   OutputT;
-
-    // Allocate CUB_CDP device arrays for temp storage size and error
-    OutputT         *d_out = NULL;
-    size_t          *d_temp_storage_bytes = NULL;
-    cudaError_t     *d_cdp_error = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out,                 sizeof(OutputT) * num_segments));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
-
-    // Inquire temp device storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(Dispatch(backend, 1,
-        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
-        d_in, d_out, num_items, num_segments, d_segment_offsets,
-        reduction_op, 0, true));
-
-    // Allocate temp device storage
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Clear device output
-    CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_segments));
-
-    // Run once with discard iterator
-    DiscardOutputIterator<OffsetT> discard_itr;
-    CubDebugExit(Dispatch(backend, 1,
-        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
-        d_in, discard_itr, num_items, num_segments, d_segment_offsets,
-        reduction_op, 0, true));
-
-    // Run warmup/correctness iteration
-    CubDebugExit(Dispatch(backend, 1,
-        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
-        d_in, d_out, num_items, num_segments, d_segment_offsets,
-        reduction_op, 0, true));
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults(h_reference, d_out, num_segments, g_verbose, g_verbose);
-    printf("\t%s", compare ? "FAIL" : "PASS");
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Performance
-    if (g_timing_iterations > 0)
-    {
-        GpuTimer gpu_timer;
-        gpu_timer.Start();
-
-        CubDebugExit(Dispatch(backend, g_timing_iterations,
-            d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
-            d_in, d_out, num_items, num_segments, d_segment_offsets,
-            reduction_op, 0, false));
-
-        gpu_timer.Stop();
-        float elapsed_millis = gpu_timer.ElapsedMillis();
-
-        // Display performance
-        float avg_millis = elapsed_millis / g_timing_iterations;
-        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
-        float giga_bandwidth = giga_rate * sizeof(InputT);
-        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
-    }
-
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
-    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    // Correctness asserts
-    AssertEquals(0, compare);
-}
-
-
-/// Test DeviceReduce
-template <
-    Backend                 BACKEND,
-    typename                OutputValueT,
-    typename                HostInputIteratorT,
-    typename                DeviceInputIteratorT,
-    typename                OffsetT,
-    typename                OffsetIteratorT,
-    typename                ReductionOpT>
-void SolveAndTest(
-    HostInputIteratorT      h_in,
-    DeviceInputIteratorT    d_in,
-    OffsetT                 num_items,
-    OffsetT                 num_segments,
-    OffsetIteratorT         h_segment_offsets,
-    OffsetIteratorT         d_segment_offsets,
-    ReductionOpT            reduction_op)
-{
-    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type     InputValueT;
-    typedef Solution<ReductionOpT, InputValueT, OutputValueT>                   SolutionT;
-    typedef typename SolutionT::OutputT                                         OutputT;
-
-    printf("\n\n%s cub::DeviceReduce<%s> %d items (%s), %d segments\n",
-        (BACKEND == CUB_CDP) ? "CUB_CDP" : (BACKEND == THRUST) ? "Thrust" : (BACKEND == CUB_SEGMENTED) ? "CUB_SEGMENTED" : "CUB",
-        typeid(ReductionOpT).name(), num_items, typeid(HostInputIteratorT).name(), num_segments);
-    fflush(stdout);
-
-    // Allocate and solve solution
-    OutputT *h_reference = new OutputT[num_segments];
-    SolutionT::Solve(h_in, h_reference, num_segments, h_segment_offsets, reduction_op);
-
-    // Run test
-    Test(Int2Type<BACKEND>(), d_in, num_items, num_segments, d_segment_offsets, reduction_op, h_reference);
-
-    // Cleanup
-    if (h_reference) delete[] h_reference;
-}
-
-
-/// Test specific problem type
-template <
-    Backend         BACKEND,
-    typename        InputT,
-    typename        OutputT,
-    typename        OffsetT,
-    typename        ReductionOpT>
-void TestProblem(
-    OffsetT         num_items,
-    OffsetT         num_segments,
-    GenMode         gen_mode,
-    ReductionOpT    reduction_op)
-{
-    printf("\n\nInitializing %d %s->%s (gen mode %d)... ", num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
-    fflush(stdout);
-
-    // Initialize value data
-    InputT* h_in = new InputT[num_items];
-    Initialize(gen_mode, h_in, num_items);
-
-    // Initialize segment data
-    OffsetT *h_segment_offsets = new OffsetT[num_segments + 1];
-    InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
-
-    // Initialize device data
-    OffsetT *d_segment_offsets      = NULL;
-    InputT  *d_in                   = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in,              sizeof(InputT) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (num_segments + 1)));
-    CubDebugExit(cudaMemcpy(d_in,               h_in,                   sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(d_segment_offsets,  h_segment_offsets,      sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
-
-    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, reduction_op);
-
-    if (h_segment_offsets)  delete[] h_segment_offsets;
-    if (d_segment_offsets)  CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
-    if (h_in)               delete[] h_in;
-    if (d_in)               CubDebugExit(g_allocator.DeviceFree(d_in));
-}
-
-
-/// Test different operators
-template <
-    Backend             BACKEND,
-    typename            OutputT,
-    typename            HostInputIteratorT,
-    typename            DeviceInputIteratorT,
-    typename            OffsetT,
-    typename            OffsetIteratorT>
-void TestByOp(
-    HostInputIteratorT      h_in,
-    DeviceInputIteratorT    d_in,
-    OffsetT                 num_items,
-    OffsetT                 num_segments,
-    OffsetIteratorT         h_segment_offsets,
-    OffsetIteratorT         d_segment_offsets)
-{
-    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, CustomMax());
-    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Sum());
-    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Min());
-    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMin());
-    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, Max());
-    SolveAndTest<BACKEND, OutputT>(h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets, ArgMax());
-}
-
-
-/// Test different backends
-template <
-    typename    InputT,
-    typename    OutputT,
-    typename    OffsetT>
-void TestByBackend(
-    OffsetT     num_items,
-    OffsetT     max_segments,
-    GenMode     gen_mode)
-{
-    // Initialize host data
-    printf("\n\nInitializing %d %s -> %s (gen mode %d)... ",
-        num_items, typeid(InputT).name(), typeid(OutputT).name(), gen_mode); fflush(stdout);
-
-    InputT  *h_in               = new InputT[num_items];
-    OffsetT *h_segment_offsets  = new OffsetT[max_segments + 1];
-    Initialize(gen_mode, h_in, num_items);
-
-    // Initialize device data
-    InputT  *d_in               = NULL;
-    OffsetT *d_segment_offsets  = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_segment_offsets, sizeof(OffsetT) * (max_segments + 1)));
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
-
-    //
-    // Test single-segment implementations
-    //
-
-    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);
-
-    // Page-aligned-input tests
-    TestByOp<CUB, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL);                 // Host-dispatch
-#ifdef CUB_CDP
-    TestByOp<CUB_CDP, OutputT>(h_in, d_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL);             // Device-dispatch
-#endif
-
-    // Non-page-aligned-input tests
-    if (num_items > 1)
-    {
-        InitializeSegments(num_items - 1, 1, h_segment_offsets, g_verbose_input);
-        TestByOp<CUB, OutputT>(h_in + 1, d_in + 1, num_items - 1, 1, h_segment_offsets, (OffsetT*) NULL);
-    }
-
-    //
-    // Test segmented implementation
-    //
-
-    // Right now we assign a single thread block to each segment, so lets keep it to under 128K items per segment
-    int max_items_per_segment = 128000;
-
-    for (int num_segments = (num_items + max_items_per_segment - 1) / max_items_per_segment;
-        num_segments < max_segments;
-        num_segments = (num_segments * 32) + 1)
-    {
-        // Test with segment pointer
-        InitializeSegments(num_items, num_segments, h_segment_offsets, g_verbose_input);
-        CubDebugExit(cudaMemcpy(d_segment_offsets, h_segment_offsets, sizeof(OffsetT) * (num_segments + 1), cudaMemcpyHostToDevice));
-        TestByOp<CUB_SEGMENTED, OutputT>(
-            h_in, d_in, num_items, num_segments, h_segment_offsets, d_segment_offsets);
-
-        // Test with segment iterator
-        typedef CastOp<OffsetT> IdentityOpT;
-        IdentityOpT identity_op;
-        TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> h_segment_offsets_itr(
-            h_segment_offsets,
-            identity_op);
-       TransformInputIterator<OffsetT, IdentityOpT, OffsetT*, OffsetT> d_segment_offsets_itr(
-            d_segment_offsets,
-            identity_op);
-
-        TestByOp<CUB_SEGMENTED, OutputT>(
-            h_in, d_in, num_items, num_segments, h_segment_offsets_itr, d_segment_offsets_itr);
-    }
-
-    if (h_in)               delete[] h_in;
-    if (h_segment_offsets)  delete[] h_segment_offsets;
-    if (d_in)               CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_segment_offsets)  CubDebugExit(g_allocator.DeviceFree(d_segment_offsets));
-}
-
-
-/// Test different input-generation modes
-template <
-    typename InputT,
-    typename OutputT,
-    typename OffsetT>
-void TestByGenMode(
-    OffsetT num_items,
-    OffsetT max_segments)
-{
-    //
-    // Test pointer support using different input-generation modes
-    //
-
-    TestByBackend<InputT, OutputT>(num_items, max_segments, UNIFORM);
-    TestByBackend<InputT, OutputT>(num_items, max_segments, INTEGER_SEED);
-    TestByBackend<InputT, OutputT>(num_items, max_segments, RANDOM);
-
-    //
-    // Test iterator support using a constant-iterator and SUM
-    //
-
-    InputT val;
-    InitValue(UNIFORM, val, 0);
-    ConstantInputIterator<InputT, OffsetT> h_in(val);
-
-    OffsetT *h_segment_offsets = new OffsetT[1 + 1];
-    InitializeSegments(num_items, 1, h_segment_offsets, g_verbose_input);
-
-    SolveAndTest<CUB, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum());
-#ifdef CUB_CDP
-    SolveAndTest<CUB_CDP, OutputT>(h_in, h_in, num_items, 1, h_segment_offsets, (OffsetT*) NULL, Sum());
-#endif
-
-    if (h_segment_offsets) delete[] h_segment_offsets;
-}
-
-
-/// Test different problem sizes
-template <
-    typename InputT,
-    typename OutputT,
-    typename OffsetT>
-struct TestBySize
-{
-    OffsetT max_items;
-    OffsetT max_segments;
-
-    TestBySize(OffsetT max_items, OffsetT max_segments) :
-        max_items(max_items),
-        max_segments(max_segments)
-    {}
-
-    template <typename ActivePolicyT>
-    cudaError_t Invoke()
-    {
-        //
-        // Black-box testing on all backends
-        //
-
-        // Test 0, 1, many
-        TestByGenMode<InputT, OutputT>(0,           max_segments);
-        TestByGenMode<InputT, OutputT>(1,           max_segments);
-        TestByGenMode<InputT, OutputT>(max_items,   max_segments);
-
-        // Test random problem sizes from a log-distribution [8, max_items-ish)
-        int     num_iterations = 8;
-        double  max_exp = log(double(max_items)) / log(double(2.0));
-        for (int i = 0; i < num_iterations; ++i)
-        {
-            OffsetT num_items = (OffsetT) pow(2.0, RandomValue(max_exp - 3.0) + 3.0);
-            TestByGenMode<InputT, OutputT>(num_items, max_segments);
-        }
-
-        //
-        // White-box testing of single-segment problems around specific sizes
-        //
-
-        // Tile-boundaries: multiple blocks, one tile per block
-        OffsetT tile_size = ActivePolicyT::ReducePolicy::BLOCK_THREADS * ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD;
-        TestProblem<CUB, InputT, OutputT>(tile_size * 4,  1,      RANDOM, Sum());
-        TestProblem<CUB, InputT, OutputT>(tile_size * 4 + 1, 1,   RANDOM, Sum());
-        TestProblem<CUB, InputT, OutputT>(tile_size * 4 - 1, 1,   RANDOM, Sum());
-
-        // Tile-boundaries: multiple blocks, multiple tiles per block
-        OffsetT sm_occupancy = 32;
-        OffsetT occupancy = tile_size * sm_occupancy * g_sm_count;
-        TestProblem<CUB, InputT, OutputT>(occupancy,  1,      RANDOM, Sum());
-        TestProblem<CUB, InputT, OutputT>(occupancy + 1, 1,   RANDOM, Sum());
-        TestProblem<CUB, InputT, OutputT>(occupancy - 1, 1,   RANDOM, Sum());
-
-        return cudaSuccess;
-    }
-};
-
-
-/// Test problem type
-template <
-    typename    InputT,
-    typename    OutputT,
-    typename    OffsetT>
-void TestType(
-    OffsetT     max_items,
-    OffsetT     max_segments)
-{
-    typedef typename DeviceReducePolicy<OutputT, OffsetT, cub::Sum>::MaxPolicy MaxPolicyT;
-
-    TestBySize<InputT, OutputT, OffsetT> dispatch(max_items, max_segments);
-
-    MaxPolicyT::Invoke(g_ptx_version, dispatch);
-}
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    typedef int OffsetT;
-
-    OffsetT max_items       = 27000000;
-    OffsetT max_segments    = 34000;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    g_verbose_input = args.CheckCmdLineFlag("v2");
-    args.GetCmdLineArgument("n", max_items);
-    args.GetCmdLineArgument("s", max_segments);
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    args.GetCmdLineArgument("repeat", g_repeat);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--s=<num segments> "
-            "[--i=<timing iterations> "
-            "[--device=<device-id>] "
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--v] "
-            "[--cdp]"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Get ptx version
-    CubDebugExit(PtxVersion(g_ptx_version));
-
-    // Get SM count
-    g_sm_count = args.deviceProp.multiProcessorCount;
-
-    std::numeric_limits<float>::max();
-
-#ifdef QUICKER_TEST
-
-    // Compile/run basic test
-
-
-
-    TestProblem<CUB, int, int>(     max_items, 1, RANDOM, Sum());
-
-    TestProblem<CUB, char, int>(    max_items, 1, RANDOM, Sum());
-
-    TestProblem<CUB, int, int>(     max_items, 1, RANDOM, ArgMax());
-
-    TestProblem<CUB, float, float>( max_items, 1, RANDOM, Sum());
-
-    TestProblem<CUB_SEGMENTED, int, int>(max_items, max_segments, RANDOM, Sum());
-
-
-#elif defined(QUICK_TEST)
-
-    // Compile/run quick comparison tests
-
-    TestProblem<CUB, char, char>(         max_items * 4, 1, UNIFORM, Sum());
-    TestProblem<THRUST, char, char>(      max_items * 4, 1, UNIFORM, Sum());
-
-    printf("\n----------------------------\n");
-    TestProblem<CUB, short, short>(        max_items * 2, 1, UNIFORM, Sum());
-    TestProblem<THRUST, short, short>(     max_items * 2, 1, UNIFORM, Sum());
-
-    printf("\n----------------------------\n");
-    TestProblem<CUB, int, int>(          max_items,     1, UNIFORM, Sum());
-    TestProblem<THRUST, int, int>(       max_items,     1, UNIFORM, Sum());
-
-    printf("\n----------------------------\n");
-    TestProblem<CUB, long long, long long>(    max_items / 2, 1, UNIFORM, Sum());
-    TestProblem<THRUST, long long, long long>( max_items / 2, 1, UNIFORM, Sum());
-
-    printf("\n----------------------------\n");
-    TestProblem<CUB, TestFoo, TestFoo>(      max_items / 4, 1, UNIFORM, Max());
-    TestProblem<THRUST, TestFoo, TestFoo>(   max_items / 4, 1, UNIFORM, Max());
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        // Test different input types
-        TestType<char, char>(max_items, max_segments);
-
-        TestType<unsigned char, unsigned char>(max_items, max_segments);
-
-        TestType<char, int>(max_items, max_segments);
-
-        TestType<short, short>(max_items, max_segments);
-        TestType<int, int>(max_items, max_segments);
-        TestType<long, long>(max_items, max_segments);
-        TestType<long long, long long>(max_items, max_segments);
-
-        TestType<uchar2, uchar2>(max_items, max_segments);
-        TestType<uint2, uint2>(max_items, max_segments);
-        TestType<ulonglong2, ulonglong2>(max_items, max_segments);
-        TestType<ulonglong4, ulonglong4>(max_items, max_segments);
-
-        TestType<TestFoo, TestFoo>(max_items, max_segments);
-        TestType<TestBar, TestBar>(max_items, max_segments);
-
-    }
-
-#endif
-
-
-    printf("\n");
-    return 0;
-}
-
-
-
diff --git a/external/cub/test/test_device_reduce_by_key.cu b/external/cub/test/test_device_reduce_by_key.cu
deleted file mode 100644
index 4d9c4726949..00000000000
--- a/external/cub/test/test_device_reduce_by_key.cu
+++ /dev/null
@@ -1,853 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of DeviceReduce::ReduceByKey utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <typeinfo>
-
-#include <thrust/device_ptr.h>
-#include <thrust/reduce.h>
-#include <thrust/iterator/constant_iterator.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/iterator/constant_input_iterator.cuh>
-#include <cub/device/device_reduce.cuh>
-#include <cub/device/device_run_length_encode.cuh>
-#include <cub/thread/thread_operators.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose           = false;
-int                     g_timing_iterations = 0;
-int                     g_repeat            = 0;
-CachingDeviceAllocator  g_allocator(true);
-
-// Dispatch types
-enum Backend
-{
-    CUB,        // CUB method
-    THRUST,     // Thrust method
-    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
-};
-
-
-//---------------------------------------------------------------------
-// Dispatch to different CUB entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to reduce-by-key entrypoint
- */
-template <
-    typename                    KeyInputIteratorT,
-    typename                    KeyOutputIteratorT,
-    typename                    ValueInputIteratorT,
-    typename                    ValueOutputIteratorT,
-    typename                    NumRunsIteratorT,
-    typename                    EqualityOpT,
-    typename                    ReductionOpT,
-    typename                    OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>               dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void                        *d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    KeyInputIteratorT           d_keys_in,
-    KeyOutputIteratorT          d_keys_out,
-    ValueInputIteratorT         d_values_in,
-    ValueOutputIteratorT        d_values_out,
-    NumRunsIteratorT            d_num_runs,
-    EqualityOpT                  equality_op,
-    ReductionOpT                 reduction_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceReduce::ReduceByKey(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys_in,
-            d_keys_out,
-            d_values_in,
-            d_values_out,
-            d_num_runs,
-            reduction_op,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-    return error;
-}
-
-
-//---------------------------------------------------------------------
-// Dispatch to different Thrust entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to reduce-by-key entrypoint
- */
-template <
-    typename                    KeyInputIteratorT,
-    typename                    KeyOutputIteratorT,
-    typename                    ValueInputIteratorT,
-    typename                    ValueOutputIteratorT,
-    typename                    NumRunsIteratorT,
-    typename                    EqualityOpT,
-    typename                    ReductionOpT,
-    typename                    OffsetT>
-cudaError_t Dispatch(
-    Int2Type<THRUST>            dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void                        *d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    KeyInputIteratorT           d_keys_in,
-    KeyOutputIteratorT          d_keys_out,
-    ValueInputIteratorT         d_values_in,
-    ValueOutputIteratorT        d_values_out,
-    NumRunsIteratorT            d_num_runs,
-    EqualityOpT                 equality_op,
-    ReductionOpT                reduction_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    // The input keys type
-    typedef typename std::iterator_traits<KeyInputIteratorT>::value_type KeyInputT;
-
-    // The output keys type
-    typedef typename If<(Equals<typename std::iterator_traits<KeyOutputIteratorT>::value_type, void>::VALUE),   // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<KeyInputIteratorT>::value_type,                                           // ... then the input iterator's value type,
-        typename std::iterator_traits<KeyOutputIteratorT>::value_type>::Type KeyOutputT;                        // ... else the output iterator's value type
-
-    // The input values type
-    typedef typename std::iterator_traits<ValueInputIteratorT>::value_type ValueInputT;
-
-    // The output values type
-    typedef typename If<(Equals<typename std::iterator_traits<ValueOutputIteratorT>::value_type, void>::VALUE), // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<ValueInputIteratorT>::value_type,                                         // ... then the input iterator's value type,
-        typename std::iterator_traits<ValueOutputIteratorT>::value_type>::Type ValueOuputT;                     // ... else the output iterator's value type
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<KeyInputT> d_keys_in_wrapper(d_keys_in);
-        thrust::device_ptr<KeyOutputT> d_keys_out_wrapper(d_keys_out);
-
-        thrust::device_ptr<ValueInputT> d_values_in_wrapper(d_values_in);
-        thrust::device_ptr<ValueOuputT> d_values_out_wrapper(d_values_out);
-
-        thrust::pair<thrust::device_ptr<KeyOutputT>, thrust::device_ptr<ValueOuputT> > d_out_ends;
-
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            d_out_ends = thrust::reduce_by_key(
-                d_keys_in_wrapper,
-                d_keys_in_wrapper + num_items,
-                d_values_in_wrapper,
-                d_keys_out_wrapper,
-                d_values_out_wrapper);
-        }
-
-        OffsetT num_segments = OffsetT(d_out_ends.first - d_keys_out_wrapper);
-        CubDebugExit(cudaMemcpy(d_num_runs, &num_segments, sizeof(OffsetT), cudaMemcpyHostToDevice));
-
-    }
-
-    return cudaSuccess;
-}
-
-
-
-//---------------------------------------------------------------------
-// CUDA Nested Parallelism Test Kernel
-//---------------------------------------------------------------------
-
-/**
- * Simple wrapper kernel to invoke DeviceSelect
- */
-template <
-    typename                    KeyInputIteratorT,
-    typename                    KeyOutputIteratorT,
-    typename                    ValueInputIteratorT,
-    typename                    ValueOutputIteratorT,
-    typename                    NumRunsIteratorT,
-    typename                    EqualityOpT,
-    typename                    ReductionOpT,
-    typename                    OffsetT>
-__global__ void CnpDispatchKernel(
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void                        *d_temp_storage,
-    size_t                      temp_storage_bytes,
-    KeyInputIteratorT           d_keys_in,
-    KeyOutputIteratorT          d_keys_out,
-    ValueInputIteratorT         d_values_in,
-    ValueOutputIteratorT        d_values_out,
-    NumRunsIteratorT            d_num_runs,
-    EqualityOpT                 equality_op,
-    ReductionOpT                reduction_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-
-#ifndef CUB_CDP
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
-
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
-}
-
-
-/**
- * Dispatch to CDP kernel
- */
-template <
-    typename                    KeyInputIteratorT,
-    typename                    KeyOutputIteratorT,
-    typename                    ValueInputIteratorT,
-    typename                    ValueOutputIteratorT,
-    typename                    NumRunsIteratorT,
-    typename                    EqualityOpT,
-    typename                    ReductionOpT,
-    typename                    OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CDP>               dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void                        *d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    KeyInputIteratorT           d_keys_in,
-    KeyOutputIteratorT          d_keys_out,
-    ValueInputIteratorT         d_values_in,
-    ValueOutputIteratorT        d_values_out,
-    NumRunsIteratorT            d_num_runs,
-    EqualityOpT                 equality_op,
-    ReductionOpT                reduction_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
-}
-
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-
-/**
- * Initialize problem
- */
-template <typename T>
-void Initialize(
-    int         entropy_reduction,
-    T           *h_in,
-    int         num_items,
-    int         max_segment)
-{
-    unsigned int max_int = (unsigned int) -1;
-
-    int key = 0;
-    int i = 0;
-    while (i < num_items)
-    {
-        // Select number of repeating occurrences
-
-        int repeat;
-
-        if (max_segment < 0)
-        {
-            repeat = num_items;
-        }
-        else if (max_segment < 2)
-        {
-            repeat = 1;
-        }
-        else
-        {
-            RandomBits(repeat, entropy_reduction);
-            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
-            repeat = CUB_MAX(1, repeat);
-        }
-
-        int j = i;
-        while (j < CUB_MIN(i + repeat, num_items))
-        {
-            InitValue(INTEGER_SEED, h_in[j], key);
-            j++;
-        }
-
-        i = j;
-        key++;
-    }
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/**
- * Solve problem.  Returns total number of segments identified
- */
-template <
-    typename        KeyInputIteratorT,
-    typename        ValueInputIteratorT,
-    typename        KeyT,
-    typename        ValueT,
-    typename        EqualityOpT,
-    typename        ReductionOpT>
-int Solve(
-    KeyInputIteratorT       h_keys_in,
-    KeyT                    *h_keys_reference,
-    ValueInputIteratorT     h_values_in,
-    ValueT                  *h_values_reference,
-    EqualityOpT             equality_op,
-    ReductionOpT            reduction_op,
-    int                     num_items)
-{
-    // First item
-    KeyT previous        = h_keys_in[0];
-    ValueT aggregate     = h_values_in[0];
-    int num_segments    = 0;
-
-    // Subsequent items
-    for (int i = 1; i < num_items; ++i)
-    {
-        if (!equality_op(previous, h_keys_in[i]))
-        {
-            h_keys_reference[num_segments] = previous;
-            h_values_reference[num_segments] = aggregate;
-            num_segments++;
-            aggregate = h_values_in[i];
-        }
-        else
-        {
-            aggregate = reduction_op(aggregate, h_values_in[i]);
-        }
-        previous = h_keys_in[i];
-    }
-
-    h_keys_reference[num_segments] = previous;
-    h_values_reference[num_segments] = aggregate;
-    num_segments++;
-
-    return num_segments;
-}
-
-
-
-/**
- * Test DeviceSelect for a given problem input
- */
-template <
-    Backend             BACKEND,
-    typename            DeviceKeyInputIteratorT,
-    typename            DeviceValueInputIteratorT,
-    typename            KeyT,
-    typename            ValueT,
-    typename            EqualityOpT,
-    typename            ReductionOpT>
-void Test(
-    DeviceKeyInputIteratorT     d_keys_in,
-    DeviceValueInputIteratorT   d_values_in,
-    KeyT*                       h_keys_reference,
-    ValueT*                     h_values_reference,
-    EqualityOpT                 equality_op,
-    ReductionOpT                reduction_op,
-    int                         num_segments,
-    int                         num_items)
-{
-    // Allocate device output arrays and number of segments
-    KeyT*   d_keys_out             = NULL;
-    ValueT* d_values_out           = NULL;
-    int*    d_num_runs         = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));
-
-    // Allocate CDP device arrays
-    size_t          *d_temp_storage_bytes = NULL;
-    cudaError_t     *d_cdp_error = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Clear device output arrays
-    CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items));
-    CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items));
-    CubDebugExit(cudaMemset(d_num_runs, 0, sizeof(int)));
-
-    // Run warmup/correctness iteration
-    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, true));
-
-    // Check for correctness (and display results, if specified)
-    int compare1 = CompareDeviceResults(h_keys_reference, d_keys_out, num_segments, true, g_verbose);
-    printf("\t Keys %s ", compare1 ? "FAIL" : "PASS");
-
-    int compare2 = CompareDeviceResults(h_values_reference, d_values_out, num_segments, true, g_verbose);
-    printf("\t Values %s ", compare2 ? "FAIL" : "PASS");
-
-    int compare3 = CompareDeviceResults(&num_segments, d_num_runs, 1, true, g_verbose);
-    printf("\t Count %s ", compare3 ? "FAIL" : "PASS");
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Performance
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_runs, equality_op, reduction_op, num_items, 0, false));
-    gpu_timer.Stop();
-    float elapsed_millis = gpu_timer.ElapsedMillis();
-
-    // Display performance
-    if (g_timing_iterations > 0)
-    {
-        float   avg_millis  = elapsed_millis / g_timing_iterations;
-        float   giga_rate   = float(num_items) / avg_millis / 1000.0f / 1000.0f;
-        int     bytes_moved = ((num_items + num_segments) * sizeof(KeyT)) + ((num_items + num_segments) * sizeof(ValueT));
-        float   giga_bandwidth  = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
-        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
-    }
-    printf("\n\n");
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Cleanup
-    if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out));
-    if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out));
-    if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
-    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
-    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    // Correctness asserts
-    AssertEquals(0, compare1 | compare2 | compare3);
-}
-
-
-/**
- * Test DeviceSelect on pointer type
- */
-template <
-    Backend         BACKEND,
-    typename        KeyT,
-    typename        ValueT,
-    typename        ReductionOpT>
-void TestPointer(
-    int             num_items,
-    int             entropy_reduction,
-    int             max_segment,
-    ReductionOpT    reduction_op)
-{
-    // Allocate host arrays
-    KeyT* h_keys_in        = new KeyT[num_items];
-    KeyT* h_keys_reference = new KeyT[num_items];
-
-    ValueT* h_values_in        = new ValueT[num_items];
-    ValueT* h_values_reference = new ValueT[num_items];
-
-    for (int i = 0; i < num_items; ++i)
-        InitValue(INTEGER_SEED, h_values_in[i], 1);
-
-    // Initialize problem and solution
-    Equality equality_op;
-    Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
-    int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
-
-    printf("\nPointer %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
-        (Equals<ReductionOpT, Sum>::VALUE) ? "Sum" : "Max",
-        num_items, num_segments, float(num_items) / num_segments,
-        typeid(KeyT).name(), typeid(ValueT).name(),
-        max_segment, entropy_reduction);
-    fflush(stdout);
-
-    // Allocate problem device arrays
-    KeyT     *d_keys_in = NULL;
-    ValueT   *d_values_in = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(ValueT) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice));
-
-    // Run Test
-    Test<BACKEND>(d_keys_in, d_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
-
-    // Cleanup
-    if (h_keys_in) delete[] h_keys_in;
-    if (h_values_in) delete[] h_values_in;
-    if (h_keys_reference) delete[] h_keys_reference;
-    if (h_values_reference) delete[] h_values_reference;
-    if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
-    if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in));
-}
-
-
-/**
- * Test on iterator type
- */
-template <
-    Backend         BACKEND,
-    typename        KeyT,
-    typename        ValueT,
-    typename        ReductionOpT>
-void TestIterator(
-    int             num_items,
-    int             entropy_reduction,
-    int             max_segment,
-    ReductionOpT    reduction_op)
-{
-    // Allocate host arrays
-    KeyT* h_keys_in        = new KeyT[num_items];
-    KeyT* h_keys_reference = new KeyT[num_items];
-
-    ValueT one_val;
-    InitValue(INTEGER_SEED, one_val, 1);
-    ConstantInputIterator<ValueT, int> h_values_in(one_val);
-    ValueT* h_values_reference = new ValueT[num_items];
-
-    // Initialize problem and solution
-    Equality equality_op;
-    Initialize(entropy_reduction, h_keys_in, num_items, max_segment);
-    int num_segments = Solve(h_keys_in, h_keys_reference, h_values_in, h_values_reference, equality_op, reduction_op, num_items);
-
-    printf("\nIterator %s cub::DeviceReduce::ReduceByKey %s reduction of %d items, %d segments (avg run length %.3f), {%s,%s} key value pairs, max_segment %d, entropy_reduction %d\n",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
-        (Equals<ReductionOpT, Sum>::VALUE) ? "Sum" : "Max",
-        num_items, num_segments, float(num_items) / num_segments,
-        typeid(KeyT).name(), typeid(ValueT).name(),
-        max_segment, entropy_reduction);
-    fflush(stdout);
-
-    // Allocate problem device arrays
-    KeyT     *d_keys_in = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice));
-
-    // Run Test
-    Test<BACKEND>(d_keys_in, h_values_in, h_keys_reference, h_values_reference, equality_op, reduction_op, num_segments, num_items);
-
-    // Cleanup
-    if (h_keys_in) delete[] h_keys_in;
-    if (h_keys_reference) delete[] h_keys_reference;
-    if (h_values_reference) delete[] h_values_reference;
-    if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in));
-}
-
-
-/**
- * Test different gen modes
- */
-template <
-    Backend         BACKEND,
-    typename        KeyT,
-    typename        ValueT,
-    typename        ReductionOpT>
-void Test(
-    int             num_items,
-    ReductionOpT    reduction_op,
-    int             max_segment)
-{
-    // 0 key-bit entropy reduction rounds
-    TestPointer<BACKEND, KeyT, ValueT>(num_items, 0, max_segment, reduction_op);
-
-    if (max_segment > 1)
-    {
-        // 2 key-bit entropy reduction rounds
-        TestPointer<BACKEND, KeyT, ValueT>(num_items, 2, max_segment, reduction_op);
-
-        // 7 key-bit entropy reduction rounds
-        TestPointer<BACKEND, KeyT, ValueT>(num_items, 7, max_segment, reduction_op);
-    }
-}
-
-
-/**
- * Test different avg segment lengths modes
- */
-template <
-    Backend         BACKEND,
-    typename        KeyT,
-    typename        ValueT,
-    typename        ReductionOpT>
-void Test(
-    int             num_items,
-    ReductionOpT    reduction_op)
-{
-    Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, -1);
-    Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, 1);
-
-    // Evaluate different max-segment lengths
-    for (int max_segment = 3; max_segment < CUB_MIN(num_items, (unsigned short) -1); max_segment *= 11)
-    {
-        Test<BACKEND, KeyT, ValueT>(num_items, reduction_op, max_segment);
-    }
-}
-
-
-
-/**
- * Test different dispatch
- */
-template <
-    typename        KeyT,
-    typename        ValueT,
-    typename        ReductionOpT>
-void TestDispatch(
-    int             num_items,
-    ReductionOpT    reduction_op)
-{
-    Test<CUB, KeyT, ValueT>(num_items, reduction_op);
-#ifdef CUB_CDP
-    Test<CDP, KeyT, ValueT>(num_items, reduction_op);
-#endif
-}
-
-
-/**
- * Test different input sizes
- */
-template <
-    typename        KeyT,
-    typename        ValueT,
-    typename        ReductionOpT>
-void TestSize(
-    int             num_items,
-    ReductionOpT    reduction_op)
-{
-    if (num_items < 0)
-    {
-        TestDispatch<KeyT, ValueT>(1,        reduction_op);
-        TestDispatch<KeyT, ValueT>(100,      reduction_op);
-        TestDispatch<KeyT, ValueT>(10000,    reduction_op);
-        TestDispatch<KeyT, ValueT>(1000000,  reduction_op);
-    }
-    else
-    {
-        TestDispatch<KeyT, ValueT>(num_items, reduction_op);
-    }
-
-}
-
-
-template <
-    typename        KeyT,
-    typename        ValueT>
-void TestOp(
-    int             num_items)
-{
-    TestSize<KeyT, ValueT>(num_items, cub::Sum());
-    TestSize<KeyT, ValueT>(num_items, cub::Max());
-}
-
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items           = -1;
-    int entropy_reduction   = 0;
-    int maxseg              = 1000;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    args.GetCmdLineArgument("repeat", g_repeat);
-    args.GetCmdLineArgument("maxseg", maxseg);
-    args.GetCmdLineArgument("entropy", entropy_reduction);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--i=<timing iterations> "
-            "[--device=<device-id>] "
-            "[--maxseg=<max segment length>]"
-            "[--entropy=<segment length bit entropy reduction rounds>]"
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--v] "
-            "[--cdp]"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-    printf("\n");
-
-    // Get ptx version
-    int ptx_version;
-    CubDebugExit(PtxVersion(ptx_version));
-
-#ifdef QUICKER_TEST
-
-    // Compile/run basic CUB test
-    if (num_items < 0) num_items = 32000000;
-
-    TestPointer<CUB, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
-    TestPointer<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
-    TestIterator<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
-
-#elif defined(QUICK_TEST)
-
-    // Compile/run quick tests
-    if (num_items < 0) num_items = 32000000;
-
-    printf("---- RLE int ---- \n");
-    TestIterator<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
-
-    printf("---- RLE long long ---- \n");
-    TestIterator<CUB, long long, int>(num_items, entropy_reduction, maxseg, cub::Sum());
-
-    printf("---- int ---- \n");
-    TestPointer<CUB, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
-    TestPointer<THRUST, int, int>(num_items, entropy_reduction, maxseg, cub::Sum());
-
-    printf("---- float ---- \n");
-    TestPointer<CUB, int, float>(num_items, entropy_reduction, maxseg, cub::Sum());
-    TestPointer<THRUST, int, float>(num_items, entropy_reduction, maxseg, cub::Sum());
-
-    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
-    {
-        printf("---- double ---- \n");
-        TestPointer<CUB, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
-        TestPointer<THRUST, int, double>(num_items, entropy_reduction, maxseg, cub::Sum());
-    }
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-
-        // Test different input types
-        TestOp<int, char>(num_items);
-        TestOp<int, short>(num_items);
-        TestOp<int, int>(num_items);
-        TestOp<int, long>(num_items);
-        TestOp<int, long long>(num_items);
-        TestOp<int, float>(num_items);
-        if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
-            TestOp<int, double>(num_items);
-
-        TestOp<int, uchar2>(num_items);
-        TestOp<int, uint2>(num_items);
-        TestOp<int, uint3>(num_items);
-        TestOp<int, uint4>(num_items);
-        TestOp<int, ulonglong4>(num_items);
-        TestOp<int, TestFoo>(num_items);
-        TestOp<int, TestBar>(num_items);
-
-        TestOp<char, int>(num_items);
-        TestOp<long long, int>(num_items);
-        TestOp<TestFoo, int>(num_items);
-        TestOp<TestBar, int>(num_items);
-
-    }
-
-#endif
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/test/test_device_run_length_encode.cu b/external/cub/test/test_device_run_length_encode.cu
deleted file mode 100644
index 0be20ce2189..00000000000
--- a/external/cub/test/test_device_run_length_encode.cu
+++ /dev/null
@@ -1,890 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of DeviceReduce::RunLengthEncode utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <typeinfo>
-
-#include <thrust/device_ptr.h>
-#include <thrust/reduce.h>
-#include <thrust/iterator/constant_iterator.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/iterator/constant_input_iterator.cuh>
-#include <cub/device/device_reduce.cuh>
-#include <cub/device/device_run_length_encode.cuh>
-#include <cub/thread/thread_operators.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose           = false;
-int                     g_timing_iterations = 0;
-int                     g_repeat            = 0;
-CachingDeviceAllocator  g_allocator(true);
-
-// Dispatch types
-enum Backend
-{
-    CUB,        // CUB method
-    THRUST,     // Thrust method
-    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
-};
-
-// Operation types
-enum RleMethod
-{
-    RLE,                // Run length encode
-    NON_TRIVIAL,
-    CSR,
-};
-
-
-//---------------------------------------------------------------------
-// Dispatch to different CUB entrypoints
-//---------------------------------------------------------------------
-
-
-/**
- * Dispatch to run-length encode entrypoint
- */
-template <
-    typename                    InputIteratorT,
-    typename                    UniqueOutputIteratorT,
-    typename                    OffsetsOutputIteratorT,
-    typename                    LengthsOutputIteratorT,
-    typename                    NumRunsIterator,
-    typename                    OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<RLE>               method,
-    Int2Type<CUB>               dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    InputIteratorT              d_in,
-    UniqueOutputIteratorT       d_unique_out,
-    OffsetsOutputIteratorT      d_offsets_out,
-    LengthsOutputIteratorT      d_lengths_out,
-    NumRunsIterator             d_num_runs,
-    cub::Equality               equality_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceRunLengthEncode::Encode(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_unique_out,
-            d_lengths_out,
-            d_num_runs,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-    return error;
-}
-
-
-/**
- * Dispatch to non-trivial runs entrypoint
- */
-template <
-    typename                    InputIteratorT,
-    typename                    UniqueOutputIteratorT,
-    typename                    OffsetsOutputIteratorT,
-    typename                    LengthsOutputIteratorT,
-    typename                    NumRunsIterator,
-    typename                    OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<NON_TRIVIAL>       method,
-    Int2Type<CUB>               dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    InputIteratorT              d_in,
-    UniqueOutputIteratorT       d_unique_out,
-    OffsetsOutputIteratorT      d_offsets_out,
-    LengthsOutputIteratorT      d_lengths_out,
-    NumRunsIterator             d_num_runs,
-    cub::Equality               equality_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceRunLengthEncode::NonTrivialRuns(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_offsets_out,
-            d_lengths_out,
-            d_num_runs,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-    return error;
-}
-
-
-
-//---------------------------------------------------------------------
-// Dispatch to different Thrust entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to run-length encode entrypoint
- */
-template <
-    typename                    InputIteratorT,
-    typename                    UniqueOutputIteratorT,
-    typename                    OffsetsOutputIteratorT,
-    typename                    LengthsOutputIteratorT,
-    typename                    NumRunsIterator,
-    typename                    OffsetT>
-cudaError_t Dispatch(
-    Int2Type<RLE>               method,
-    Int2Type<THRUST>            dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void                        *d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    InputIteratorT              d_in,
-    UniqueOutputIteratorT       d_unique_out,
-    OffsetsOutputIteratorT      d_offsets_out,
-    LengthsOutputIteratorT      d_lengths_out,
-    NumRunsIterator             d_num_runs,
-    cub::Equality               equality_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                                // ... then the input iterator's value type,
-        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type UniqueT;                          // ... else the output iterator's value type
-
-    // The lengths output value type
-    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
-        OffsetT,                                                                                                    // ... then the OffsetT type,
-        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<InputT>      d_in_wrapper(d_in);
-        thrust::device_ptr<UniqueT>     d_unique_out_wrapper(d_unique_out);
-        thrust::device_ptr<LengthT>     d_lengths_out_wrapper(d_lengths_out);
-
-        thrust::pair<thrust::device_ptr<UniqueT>, thrust::device_ptr<LengthT> > d_out_ends;
-
-        LengthT one_val;
-        InitValue(INTEGER_SEED, one_val, 1);
-        thrust::constant_iterator<LengthT> constant_one(one_val);
-
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            d_out_ends = thrust::reduce_by_key(
-                d_in_wrapper,
-                d_in_wrapper + num_items,
-                constant_one,
-                d_unique_out_wrapper,
-                d_lengths_out_wrapper);
-        }
-
-        OffsetT num_runs = OffsetT(d_out_ends.first - d_unique_out_wrapper);
-        CubDebugExit(cudaMemcpy(d_num_runs, &num_runs, sizeof(OffsetT), cudaMemcpyHostToDevice));
-    }
-
-    return cudaSuccess;
-}
-
-
-
-//---------------------------------------------------------------------
-// CUDA Nested Parallelism Test Kernel
-//---------------------------------------------------------------------
-
-/**
- * Simple wrapper kernel to invoke DeviceRunLengthEncode
- */
-template <
-    int                         RLE_METHOD,
-    typename                    InputIteratorT,
-    typename                    UniqueOutputIteratorT,
-    typename                    OffsetsOutputIteratorT,
-    typename                    LengthsOutputIteratorT,
-    typename                    NumRunsIterator,
-    typename                    EqualityOp,
-    typename                    OffsetT>
-__global__ void CnpDispatchKernel(
-    Int2Type<RLE_METHOD>            method,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t                      temp_storage_bytes,
-    InputIteratorT              d_in,
-    UniqueOutputIteratorT       d_unique_out,
-    OffsetsOutputIteratorT      d_offsets_out,
-    LengthsOutputIteratorT      d_lengths_out,
-    NumRunsIterator             d_num_runs,
-    cub::Equality               equality_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-
-#ifndef CUB_CDP
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(method, Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);
-
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
-}
-
-
-/**
- * Dispatch to CDP kernel
- */
-template <
-    int                         RLE_METHOD,
-    typename                    InputIteratorT,
-    typename                    UniqueOutputIteratorT,
-    typename                    OffsetsOutputIteratorT,
-    typename                    LengthsOutputIteratorT,
-    typename                    NumRunsIterator,
-    typename                    EqualityOp,
-    typename                    OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<RLE_METHOD>        method,
-    Int2Type<CDP>               dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    InputIteratorT              d_in,
-    UniqueOutputIteratorT       d_unique_out,
-    OffsetsOutputIteratorT      d_offsets_out,
-    LengthsOutputIteratorT      d_lengths_out,
-    NumRunsIterator             d_num_runs,
-    EqualityOp                  equality_op,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(method, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
-}
-
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-
-/**
- * Initialize problem
- */
-template <typename T>
-void Initialize(
-    int         entropy_reduction,
-    T           *h_in,
-    int         num_items,
-    int         max_segment)
-{
-    unsigned int max_int = (unsigned int) -1;
-
-    int key = 0;
-    int i = 0;
-    while (i < num_items)
-    {
-        // Select number of repeating occurrences for the current run
-        int repeat;
-        if (max_segment < 0)
-        {
-            repeat = num_items;
-        }
-        else if (max_segment < 2)
-        {
-            repeat = 1;
-        }
-        else
-        {
-            RandomBits(repeat, entropy_reduction);
-            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
-            repeat = CUB_MAX(1, repeat);
-        }
-
-        int j = i;
-        while (j < CUB_MIN(i + repeat, num_items))
-        {
-            InitValue(INTEGER_SEED, h_in[j], key);
-            j++;
-        }
-
-        i = j;
-        key++;
-    }
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/**
- * Solve problem.  Returns total number of segments identified
- */
-template <
-    RleMethod       RLE_METHOD,
-    typename        InputIteratorT,
-    typename        T,
-    typename        OffsetT,
-    typename        LengthT,
-    typename        EqualityOp>
-int Solve(
-    InputIteratorT  h_in,
-    T               *h_unique_reference,
-    OffsetT         *h_offsets_reference,
-    LengthT         *h_lengths_reference,
-    EqualityOp      equality_op,
-    int             num_items)
-{
-    if (num_items == 0) 
-        return 0;
-
-    // First item
-    T       previous        = h_in[0];
-    LengthT  length          = 1;
-    int     num_runs        = 0;
-    int     run_begin       = 0;
-
-    // Subsequent items
-    for (int i = 1; i < num_items; ++i)
-    {
-        if (!equality_op(previous, h_in[i]))
-        {
-            if ((RLE_METHOD != NON_TRIVIAL) || (length > 1))
-            {
-                h_unique_reference[num_runs]      = previous;
-                h_offsets_reference[num_runs]     = run_begin;
-                h_lengths_reference[num_runs]     = length;
-                num_runs++;
-            }
-            length = 1;
-            run_begin = i;
-        }
-        else
-        {
-            length++;
-        }
-        previous = h_in[i];
-    }
-
-    if ((RLE_METHOD != NON_TRIVIAL) || (length > 1))
-    {
-        h_unique_reference[num_runs]    = previous;
-        h_offsets_reference[num_runs]   = run_begin;
-        h_lengths_reference[num_runs]   = length;
-        num_runs++;
-    }
-
-    return num_runs;
-}
-
-
-
-/**
- * Test DeviceRunLengthEncode for a given problem input
- */
-template <
-    RleMethod           RLE_METHOD,
-    Backend             BACKEND,
-    typename            DeviceInputIteratorT,
-    typename            T,
-    typename            OffsetT,
-    typename            LengthT,
-    typename            EqualityOp>
-void Test(
-    DeviceInputIteratorT d_in,
-    T                   *h_unique_reference,
-    OffsetT             *h_offsets_reference,
-    LengthT             *h_lengths_reference,
-    EqualityOp          equality_op,
-    int                 num_runs,
-    int                 num_items)
-{
-    // Allocate device output arrays and number of segments
-    T*          d_unique_out       = NULL;
-    LengthT*    d_offsets_out      = NULL;
-    OffsetT*    d_lengths_out      = NULL;
-    int*        d_num_runs         = NULL;
-
-    if (RLE_METHOD == RLE)
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_unique_out, sizeof(T) * num_items));
-    if (RLE_METHOD == NON_TRIVIAL)
-        CubDebugExit(g_allocator.DeviceAllocate((void**)&d_offsets_out, sizeof(OffsetT) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_lengths_out, sizeof(LengthT) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_runs, sizeof(int)));
-
-    // Allocate CDP device arrays
-    size_t*          d_temp_storage_bytes = NULL;
-    cudaError_t*     d_cdp_error = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
-
-    // Allocate temporary storage
-    void*           d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Clear device output arrays
-    if (RLE_METHOD == RLE)
-        CubDebugExit(cudaMemset(d_unique_out,   0, sizeof(T) * num_items));
-    if (RLE_METHOD == NON_TRIVIAL)
-        CubDebugExit(cudaMemset(d_offsets_out,  0, sizeof(OffsetT) * num_items));
-    CubDebugExit(cudaMemset(d_lengths_out,  0, sizeof(LengthT) * num_items));
-    CubDebugExit(cudaMemset(d_num_runs,     0, sizeof(int)));
-
-    // Run warmup/correctness iteration
-    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, true));
-
-    // Check for correctness (and display results, if specified)
-    int compare0 = 0;
-    int compare1 = 0;
-    int compare2 = 0;
-    int compare3 = 0;
-
-    if (RLE_METHOD == RLE)
-    {
-        compare0 = CompareDeviceResults(h_unique_reference, d_unique_out, num_runs, true, g_verbose);
-        printf("\t Keys %s\n", compare0 ? "FAIL" : "PASS");
-    }
-
-    if (RLE_METHOD != RLE)
-    {
-        compare1 = CompareDeviceResults(h_offsets_reference, d_offsets_out, num_runs, true, g_verbose);
-        printf("\t Offsets %s\n", compare1 ? "FAIL" : "PASS");
-    }
-
-    if (RLE_METHOD != CSR)
-    {
-        compare2 = CompareDeviceResults(h_lengths_reference, d_lengths_out, num_runs, true, g_verbose);
-        printf("\t Lengths %s\n", compare2 ? "FAIL" : "PASS");
-    }
-
-    compare3 = CompareDeviceResults(&num_runs, d_num_runs, 1, true, g_verbose);
-    printf("\t Count %s\n", compare3 ? "FAIL" : "PASS");
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Performance
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-    CubDebugExit(Dispatch(Int2Type<RLE_METHOD>(), Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_offsets_out, d_lengths_out, d_num_runs, equality_op, num_items, 0, false));
-    gpu_timer.Stop();
-    float elapsed_millis = gpu_timer.ElapsedMillis();
-
-    // Display performance
-    if (g_timing_iterations > 0)
-    {
-        float avg_millis = elapsed_millis / g_timing_iterations;
-        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
-        int bytes_moved = (num_items * sizeof(T)) + (num_runs * (sizeof(OffsetT) + sizeof(LengthT)));
-        float giga_bandwidth = float(bytes_moved) / avg_millis / 1000.0f / 1000.0f;
-        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s", avg_millis, giga_rate, giga_bandwidth);
-    }
-    printf("\n\n");
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Cleanup
-    if (d_unique_out) CubDebugExit(g_allocator.DeviceFree(d_unique_out));
-    if (d_offsets_out) CubDebugExit(g_allocator.DeviceFree(d_offsets_out));
-    if (d_lengths_out) CubDebugExit(g_allocator.DeviceFree(d_lengths_out));
-    if (d_num_runs) CubDebugExit(g_allocator.DeviceFree(d_num_runs));
-    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
-    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    // Correctness asserts
-    AssertEquals(0, compare0 | compare1 | compare2 | compare3);
-}
-
-
-/**
- * Test DeviceRunLengthEncode on pointer type
- */
-template <
-    RleMethod       RLE_METHOD,
-    Backend         BACKEND,
-    typename        T,
-    typename        OffsetT,
-    typename        LengthT>
-void TestPointer(
-    int             num_items,
-    int             entropy_reduction,
-    int             max_segment)
-{
-    // Allocate host arrays
-    T*      h_in                    = new T[num_items];
-    T*      h_unique_reference      = new T[num_items];
-    OffsetT* h_offsets_reference     = new OffsetT[num_items];
-    LengthT* h_lengths_reference     = new LengthT[num_items];
-
-    for (int i = 0; i < num_items; ++i)
-        InitValue(INTEGER_SEED, h_offsets_reference[i], 1);
-
-    // Initialize problem and solution
-    Equality equality_op;
-    Initialize(entropy_reduction, h_in, num_items, max_segment);
-
-    int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);
-
-    printf("\nPointer %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}, max_segment %d, entropy_reduction %d\n",
-        (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
-        num_items, num_runs, float(num_items) / num_runs,
-        typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name(),
-        max_segment, entropy_reduction);
-    fflush(stdout);
-
-    // Allocate problem device arrays
-    T* d_in = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
-
-    // Run Test
-    Test<RLE_METHOD, BACKEND>(d_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_unique_reference) delete[] h_unique_reference;
-    if (h_offsets_reference) delete[] h_offsets_reference;
-    if (h_lengths_reference) delete[] h_lengths_reference;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-}
-
-
-/**
- * Test on iterator type
- */
-template <
-    RleMethod       RLE_METHOD,
-    Backend         BACKEND,
-    typename        T,
-    typename        OffsetT,
-    typename        LengthT>
-void TestIterator(
-    int             num_items,
-    Int2Type<true>  is_primitive)
-{
-    // Allocate host arrays
-    T* h_unique_reference       = new T[num_items];
-    OffsetT* h_offsets_reference = new OffsetT[num_items];
-    LengthT* h_lengths_reference = new LengthT[num_items];
-
-    T one_val;
-    InitValue(INTEGER_SEED, one_val, 1);
-    ConstantInputIterator<T, int> h_in(one_val);
-
-    // Initialize problem and solution
-    Equality equality_op;
-    int num_runs = Solve<RLE_METHOD>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_items);
-
-    printf("\nIterator %s cub::%s on %d items, %d segments (avg run length %.3f), {%s key, %s offset, %s length}\n",
-        (RLE_METHOD == RLE) ? "DeviceReduce::RunLengthEncode" : (RLE_METHOD == NON_TRIVIAL) ? "DeviceRunLengthEncode::NonTrivialRuns" : "Other",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
-        num_items, num_runs, float(num_items) / num_runs,
-        typeid(T).name(), typeid(OffsetT).name(), typeid(LengthT).name());
-    fflush(stdout);
-
-    // Run Test
-    Test<RLE_METHOD, BACKEND>(h_in, h_unique_reference, h_offsets_reference, h_lengths_reference, equality_op, num_runs, num_items);
-
-    // Cleanup
-    if (h_unique_reference) delete[] h_unique_reference;
-    if (h_offsets_reference) delete[] h_offsets_reference;
-    if (h_lengths_reference) delete[] h_lengths_reference;
-}
-
-
-template <
-    RleMethod       RLE_METHOD,
-    Backend         BACKEND,
-    typename        T,
-    typename        OffsetT,
-    typename        LengthT>
-void TestIterator(
-    int             num_items,
-    Int2Type<false> is_primitive)
-{}
-
-
-/**
- * Test different gen modes
- */
-template <
-    RleMethod       RLE_METHOD,
-    Backend         BACKEND,
-    typename        T,
-    typename        OffsetT,
-    typename        LengthT>
-void Test(
-    int             num_items)
-{
-    // Test iterator (one run)
-    TestIterator<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, Int2Type<Traits<T>::PRIMITIVE>());
-
-    // num_items runs
-    TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, 1);
-
-    // Evaluate different run lengths
-    for (int max_segment = 3; max_segment < CUB_MIN(num_items, (unsigned short) -1); max_segment *= 3)
-    {
-        // Uniform selection run length
-        TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 0, max_segment);
-
-        // Reduced-entropy run length
-        TestPointer<RLE_METHOD, BACKEND, T, OffsetT, LengthT>(num_items, 4, max_segment);
-    }
-}
-
-
-/**
- * Test different dispatch
- */
-template <
-    typename        T,
-    typename        OffsetT,
-    typename        LengthT>
-void TestDispatch(
-    int             num_items)
-{
-    Test<RLE,           CUB, T, OffsetT, LengthT>(num_items);
-    Test<NON_TRIVIAL,   CUB, T, OffsetT, LengthT>(num_items);
-
-#ifdef CUB_CDP
-    Test<RLE,           CDP, T, OffsetT, LengthT>(num_items);
-    Test<NON_TRIVIAL,   CDP, T, OffsetT, LengthT>(num_items);
-#endif
-}
-
-
-/**
- * Test different input sizes
- */
-template <
-    typename        T,
-    typename        OffsetT,
-    typename        LengthT>
-void TestSize(
-    int             num_items)
-{
-    if (num_items < 0)
-    {
-        TestDispatch<T, OffsetT, LengthT>(0);
-        TestDispatch<T, OffsetT, LengthT>(1);
-        TestDispatch<T, OffsetT, LengthT>(100);
-        TestDispatch<T, OffsetT, LengthT>(10000);
-        TestDispatch<T, OffsetT, LengthT>(1000000);
-
-        // Randomly select problem size between 1:10,000,000
-        unsigned int max_int = (unsigned int) -1;
-        for (int i = 0; i < 10; ++i)
-        {
-            unsigned int num_items;
-            RandomBits(num_items);
-            num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));
-            num_items = CUB_MAX(1, num_items);
-            TestDispatch<T, OffsetT, LengthT>(num_items);
-        }
-    }
-    else
-    {
-        TestDispatch<T, OffsetT, LengthT>(num_items);
-    }
-
-}
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items           = -1;
-    int entropy_reduction   = 0;
-    int max_segment              = 1000;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    args.GetCmdLineArgument("repeat", g_repeat);
-    args.GetCmdLineArgument("maxseg", max_segment);
-    args.GetCmdLineArgument("entropy", entropy_reduction);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--i=<timing iterations> "
-            "[--device=<device-id>] "
-            "[--maxseg=<max segment length>]"
-            "[--entropy=<segment length bit entropy reduction rounds>]"
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--v] "
-            "[--cdp]"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-    printf("\n");
-
-    // Get ptx version
-    int ptx_version;
-    CubDebugExit(PtxVersion(ptx_version));
-
-#ifdef QUICKER_TEST
-
-    // Compile/run basic CUB test
-    if (num_items < 0) num_items = 32000000;
-
-    TestPointer<RLE,            CUB, int, int, int>(    num_items, entropy_reduction, max_segment);
-    TestPointer<NON_TRIVIAL,    CUB, int, int, int>(    num_items, entropy_reduction, max_segment);
-    TestIterator<RLE,           CUB, float, int, int>(  num_items, Int2Type<Traits<float>::PRIMITIVE>());
-
-
-#elif defined(QUICK_TEST)
-
-    // Compile/run quick tests
-    if (num_items < 0) num_items = 32000000;
-
-    TestPointer<RLE,            CUB, int, int, int>(    num_items, entropy_reduction, max_segment);
-    TestPointer<RLE,            THRUST, int, int, int>(    num_items, entropy_reduction, max_segment);
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        // Test different input types
-        TestSize<char,          int, int>(num_items);
-        TestSize<short,         int, int>(num_items);
-        TestSize<int,           int, int>(num_items);
-        TestSize<long,          int, int>(num_items);
-        TestSize<long long,     int, int>(num_items);
-        TestSize<float,         int, int>(num_items);
-        TestSize<double,        int, int>(num_items);
-
-        TestSize<uchar2,        int, int>(num_items);
-        TestSize<uint2,         int, int>(num_items);
-        TestSize<uint3,         int, int>(num_items);
-        TestSize<uint4,         int, int>(num_items);
-        TestSize<ulonglong4,    int, int>(num_items);
-        TestSize<TestFoo,       int, int>(num_items);
-        TestSize<TestBar,       int, int>(num_items);
-    }
-
-#endif
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/test/test_device_scan.cu b/external/cub/test/test_device_scan.cu
deleted file mode 100644
index 63c80682ced..00000000000
--- a/external/cub/test/test_device_scan.cu
+++ /dev/null
@@ -1,1015 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of DeviceScan utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <typeinfo>
-
-#include <thrust/device_ptr.h>
-#include <thrust/scan.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/iterator/constant_input_iterator.cuh>
-#include <cub/iterator/discard_output_iterator.cuh>
-#include <cub/device/device_scan.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose           = false;
-int                     g_timing_iterations = 0;
-int                     g_repeat            = 0;
-double                  g_device_giga_bandwidth;
-CachingDeviceAllocator  g_allocator(true);
-
-// Dispatch types
-enum Backend
-{
-    CUB,        // CUB method
-    THRUST,     // Thrust method
-    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
-};
-
-
-/**
- * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants)
- */
-template<typename OpT>
-struct WrapperFunctor
-{
-    OpT op;
-
-    WrapperFunctor(OpT op) : op(op) {}
-
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return op(a, b);
-    }
-};
-
-
-//---------------------------------------------------------------------
-// Dispatch to different CUB DeviceScan entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to exclusive scan entrypoint
- */
-template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>       dispatch_to,
-    IsPrimitiveT        is_primitive,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    ScanOpT             scan_op,
-    InitialValueT       initial_value,
-    OffsetT             num_items,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, initial_value, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-
-/**
- * Dispatch to exclusive sum entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename InitialValueT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>       dispatch_to,
-    Int2Type<true>      is_primitive,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    Sum                 scan_op,
-    InitialValueT       initial_value,
-    OffsetT             num_items,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-
-/**
- * Dispatch to inclusive scan entrypoint
- */
-template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>       dispatch_to,
-    IsPrimitiveT        is_primitive,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    ScanOpT             scan_op,
-    NullType            initial_value,
-    OffsetT             num_items,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-
-/**
- * Dispatch to inclusive sum entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>       dispatch_to,
-    Int2Type<true>      is_primitive,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    Sum                 scan_op,
-    NullType            initial_value,
-    OffsetT             num_items,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-//---------------------------------------------------------------------
-// Dispatch to different Thrust entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to exclusive scan entrypoint
- */
-template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
-cudaError_t Dispatch(
-    Int2Type<THRUST>    dispatch_to,
-    IsPrimitiveT        is_primitive,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    ScanOpT             scan_op,
-    InitialValueT       initial_value,
-    OffsetT             num_items,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<InputT> d_in_wrapper(d_in);
-        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            thrust::exclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, initial_value, scan_op);
-        }
-    }
-
-    return cudaSuccess;
-}
-
-
-/**
- * Dispatch to exclusive sum entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename InitialValueT, typename OffsetT>
-cudaError_t Dispatch(
-    Int2Type<THRUST>    dispatch_to,
-    Int2Type<true>      is_primitive,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    Sum                 scan_op,
-    InitialValueT       initial_value,
-    OffsetT             num_items,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<InputT> d_in_wrapper(d_in);
-        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            thrust::exclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
-        }
-    }
-
-    return cudaSuccess;
-}
-
-
-/**
- * Dispatch to inclusive scan entrypoint
- */
-template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
-cudaError_t Dispatch(
-    Int2Type<THRUST>    dispatch_to,
-    IsPrimitiveT        is_primitive,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    ScanOpT             scan_op,
-    NullType            initial_value,
-    OffsetT             num_items,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<InputT> d_in_wrapper(d_in);
-        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            thrust::inclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, scan_op);
-        }
-    }
-
-    return cudaSuccess;
-}
-
-
-/**
- * Dispatch to inclusive sum entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
-cudaError_t Dispatch(
-    Int2Type<THRUST>    dispatch_to,
-    Int2Type<true>      is_primitive,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    Sum                 scan_op,
-    NullType            initial_value,
-    OffsetT             num_items,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<InputT> d_in_wrapper(d_in);
-        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            thrust::inclusive_scan(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
-        }
-    }
-
-    return cudaSuccess;
-}
-
-
-
-//---------------------------------------------------------------------
-// CUDA Nested Parallelism Test Kernel
-//---------------------------------------------------------------------
-
-/**
- * Simple wrapper kernel to invoke DeviceScan
- */
-template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
-__global__ void CnpDispatchKernel(
-    IsPrimitiveT        is_primitive,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t              temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    ScanOpT             scan_op,
-    InitialValueT       initial_value,
-    OffsetT             num_items,
-    bool                debug_synchronous)
-{
-#ifndef CUB_CDP
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(
-        Int2Type<CUB>(),
-        is_primitive,
-        timing_timing_iterations,
-        d_temp_storage_bytes,
-        d_cdp_error,
-        d_temp_storage,
-        temp_storage_bytes,
-        d_in,
-        d_out,
-        scan_op,
-        initial_value,
-        num_items,
-        0,
-        debug_synchronous);
-
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
-}
-
-
-/**
- * Dispatch to CDP kernel
- */
-template <typename IsPrimitiveT, typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename InitialValueT, typename OffsetT>
-cudaError_t Dispatch(
-    Int2Type<CDP>       dispatch_to,
-    IsPrimitiveT        is_primitive,
-    int                 timing_timing_iterations,
-    size_t              *d_temp_storage_bytes,
-    cudaError_t         *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t&             temp_storage_bytes,
-    InputIteratorT      d_in,
-    OutputIteratorT     d_out,
-    ScanOpT             scan_op,
-    InitialValueT       initial_value,
-    OffsetT             num_items,
-    cudaStream_t        stream,
-    bool                debug_synchronous)
-{
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(
-        is_primitive,
-        timing_timing_iterations,
-        d_temp_storage_bytes,
-        d_cdp_error,
-        d_temp_storage,
-        temp_storage_bytes,
-        d_in,
-        d_out,
-        scan_op,
-        initial_value,
-        num_items,
-        debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
-}
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-
-/**
- * Initialize problem
- */
-template <typename T>
-void Initialize(
-    GenMode      gen_mode,
-    T            *h_in,
-    int          num_items)
-{
-    for (int i = 0; i < num_items; ++i)
-    {
-        InitValue(gen_mode, h_in[i], i);
-    }
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("\n\n");
-    }
-}
-
-/**
- * Solve exclusive-scan problem
- */
-template <
-    typename        InputIteratorT,
-    typename        OutputT,
-    typename        ScanOpT>
-void Solve(
-    InputIteratorT  h_in,
-    OutputT         *h_reference,
-    int             num_items,
-    ScanOpT         scan_op,
-    OutputT         initial_value)
-{
-    if (num_items > 0)
-    {
-        OutputT val         = h_in[0];
-        h_reference[0]      = initial_value;
-        OutputT inclusive   = scan_op(initial_value, val);
-
-        for (int i = 1; i < num_items; ++i)
-        {
-            val = h_in[i];
-            h_reference[i] = inclusive;
-            inclusive = scan_op(inclusive, val);
-        }
-    }
-}
-
-
-/**
- * Solve inclusive-scan problem
- */
-template <
-    typename        InputIteratorT,
-    typename        OutputT,
-    typename        ScanOpT>
-void Solve(
-    InputIteratorT  h_in,
-    OutputT         *h_reference,
-    int             num_items,
-    ScanOpT         scan_op,
-    NullType)
-{
-    if (num_items > 0)
-    {
-        OutputT inclusive   = h_in[0];
-        h_reference[0]      = inclusive;
-
-        for (int i = 1; i < num_items; ++i)
-        {
-            OutputT val = h_in[i];
-            inclusive = scan_op(inclusive, val);
-            h_reference[i] = inclusive;
-        }
-    }
-}
-
-
-/**
- * Test DeviceScan for a given problem input
- */
-template <
-    Backend             BACKEND,
-    typename            DeviceInputIteratorT,
-    typename            OutputT,
-    typename            ScanOpT,
-    typename            InitialValueT>
-void Test(
-    DeviceInputIteratorT    d_in,
-    OutputT                 *h_reference,
-    int                     num_items,
-    ScanOpT                 scan_op,
-    InitialValueT           initial_value)
-{
-    typedef typename std::iterator_traits<DeviceInputIteratorT>::value_type InputT;
-
-    // Allocate device output array
-    OutputT *d_out = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(OutputT) * num_items));
-
-    // Allocate CDP device arrays
-    size_t          *d_temp_storage_bytes = NULL;
-    cudaError_t     *d_cdp_error = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,   sizeof(cudaError_t) * 1));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(Dispatch(
-        Int2Type<BACKEND>(),
-        Int2Type<Traits<OutputT>::PRIMITIVE>(),
-        1,
-        d_temp_storage_bytes,
-        d_cdp_error,
-        d_temp_storage,
-        temp_storage_bytes,
-        d_in,
-        d_out,
-        scan_op,
-        initial_value,
-        num_items,
-        0,
-        true));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Clear device output array
-    CubDebugExit(cudaMemset(d_out, 0, sizeof(OutputT) * num_items));
-
-    // Run warmup/correctness iteration
-    CubDebugExit(Dispatch(
-        Int2Type<BACKEND>(),
-        Int2Type<Traits<OutputT>::PRIMITIVE>(),
-        1,
-        d_temp_storage_bytes,
-        d_cdp_error,
-        d_temp_storage,
-        temp_storage_bytes,
-        d_in,
-        d_out,
-        scan_op,
-        initial_value,
-        num_items,
-        0,
-        true));
-
-    // Check for correctness (and display results, if specified)
-    int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose);
-    printf("\t%s", compare ? "FAIL" : "PASS");
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Performance
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-    CubDebugExit(Dispatch(Int2Type<BACKEND>(),
-        Int2Type<Traits<OutputT>::PRIMITIVE>(),
-        g_timing_iterations,
-        d_temp_storage_bytes,
-        d_cdp_error,
-        d_temp_storage,
-        temp_storage_bytes,
-        d_in,
-        d_out,
-        scan_op,
-        initial_value,
-        num_items,
-        0,
-        false));
-    gpu_timer.Stop();
-    float elapsed_millis = gpu_timer.ElapsedMillis();
-
-    // Display performance
-    if (g_timing_iterations > 0)
-    {
-        float avg_millis = elapsed_millis / g_timing_iterations;
-        float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f;
-        float giga_bandwidth = giga_rate * (sizeof(InputT) + sizeof(OutputT));
-        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
-    }
-
-    printf("\n\n");
-
-    // Cleanup
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
-    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    // Correctness asserts
-    AssertEquals(0, compare);
-}
-
-
-/**
- * Test DeviceScan on pointer type
- */
-template <
-    Backend         BACKEND,
-    typename        InputT,
-    typename        OutputT,
-    typename        ScanOpT,
-    typename        InitialValueT>
-void TestPointer(
-    int             num_items,
-    GenMode         gen_mode,
-    ScanOpT         scan_op,
-    InitialValueT   initial_value)
-{
-    printf("\nPointer %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes) , gen-mode %s\n",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
-        (Equals<InitialValueT, NullType>::VALUE) ? "Inclusive" : "Exclusive",
-        (Equals<ScanOpT, Sum>::VALUE) ? "Sum" : "Scan",
-        num_items,
-        typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT),
-        (gen_mode == RANDOM) ? "RANDOM" : (gen_mode == INTEGER_SEED) ? "SEQUENTIAL" : "HOMOGENOUS");
-    fflush(stdout);
-
-    // Allocate host arrays
-    InputT*     h_in        = new InputT[num_items];
-    OutputT*    h_reference = new OutputT[num_items];
-
-    // Initialize problem and solution
-    Initialize(gen_mode, h_in, num_items);
-    Solve(h_in, h_reference, num_items, scan_op, initial_value);
-
-    // Allocate problem device arrays
-    InputT *d_in = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(InputT) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(InputT) * num_items, cudaMemcpyHostToDevice));
-
-    // Run Test
-    Test<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-}
-
-
-/**
- * Test DeviceScan on iterator type
- */
-template <
-    Backend         BACKEND,
-    typename        InputT,
-    typename        OutputT,
-    typename        ScanOpT,
-    typename        InitialValueT>
-void TestIterator(
-    int             num_items,
-    ScanOpT         scan_op,
-    InitialValueT   initial_value)
-{
-    printf("\nIterator %s %s cub::DeviceScan::%s %d items, %s->%s (%d->%d bytes)\n",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
-        (Equals<InitialValueT, NullType>::VALUE) ? "Inclusive" : "Exclusive",
-        (Equals<ScanOpT, Sum>::VALUE) ? "Sum" : "Scan",
-        num_items,
-        typeid(InputT).name(), typeid(OutputT).name(), (int) sizeof(InputT), (int) sizeof(OutputT));
-    fflush(stdout);
-
-    // Use a constant iterator as the input
-    InputT val = InputT();
-    ConstantInputIterator<InputT, int> h_in(val);
-
-    // Allocate host arrays
-    OutputT*  h_reference = new OutputT[num_items];
-
-    // Initialize problem and solution
-    Solve(h_in, h_reference, num_items, scan_op, initial_value);
-
-    // Run Test
-    Test<BACKEND>(h_in, h_reference, num_items, scan_op, initial_value);
-
-    // Cleanup
-    if (h_reference) delete[] h_reference;
-}
-
-
-/**
- * Test different gen modes
- */
-template <
-    Backend         BACKEND,
-    typename        InputT,
-    typename        OutputT,
-    typename        ScanOpT,
-    typename        InitialValueT>
-void Test(
-    int             num_items,
-    ScanOpT         scan_op,
-    InitialValueT   initial_value)
-{
-    TestPointer<BACKEND, InputT, OutputT>(  num_items, UNIFORM, scan_op, initial_value);
-    TestPointer<BACKEND, InputT, OutputT>(  num_items, RANDOM,  scan_op, initial_value);
-    TestIterator<BACKEND, InputT, OutputT>( num_items, scan_op, initial_value);
-}
-
-
-/**
- * Test different dispatch
- */
-template <
-    typename        InputT,
-    typename        OutputT,
-    typename        ScanOpT,
-    typename        InitialValueT>
-void Test(
-    int             num_items,
-    ScanOpT         scan_op,
-    InitialValueT   initial_value)
-{
-    Test<CUB, InputT, OutputT>(num_items, scan_op, initial_value);
-#ifdef CUB_CDP
-    Test<CDP, InputT, OutputT>(num_items, scan_op, initial_value);
-#endif
-}
-
-
-/**
- * Test different operators
- */
-template <typename InputT, typename OutputT>
-void TestOp(
-    int             num_items,
-    OutputT         identity,
-    OutputT         initial_value)
-{
-    // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values)
-    Test<InputT, OutputT>(num_items, cub::Sum(), identity);
-    Test<InputT, OutputT>(num_items, cub::Max(), identity);
-
-    // Exclusive (non-specialized, so we can test initial-value)
-    Test<InputT, OutputT>(num_items, WrapperFunctor<cub::Sum>(cub::Sum()), initial_value);
-    Test<InputT, OutputT>(num_items, WrapperFunctor<cub::Max>(cub::Max()), initial_value);
-
-    // Inclusive (no initial value)
-    Test<InputT, OutputT>(num_items, cub::Sum(), NullType());
-    Test<InputT, OutputT>(num_items, cub::Max(), NullType());
-}
-
-
-/**
- * Test different input sizes
- */
-template <
-    typename InputT,
-    typename OutputT>
-void TestSize(
-    int     num_items,
-    OutputT identity,
-    OutputT initial_value)
-{
-    if (num_items < 0)
-    {
-        TestOp<InputT>(0,        identity, initial_value);
-        TestOp<InputT>(1,        identity, initial_value);
-        TestOp<InputT>(100,      identity, initial_value);
-        TestOp<InputT>(10000,    identity, initial_value);
-        TestOp<InputT>(1000000,  identity, initial_value);
-
-        // Randomly select problem size between 1:10,000,000
-        unsigned int max_int = (unsigned int) -1;
-        for (int i = 0; i < 10; ++i)
-        {
-            unsigned int num_items;
-            RandomBits(num_items);
-            num_items = (unsigned int) ((double(num_items) * double(10000000)) / double(max_int));
-            num_items = CUB_MAX(1, num_items);
-            TestOp<InputT>(num_items,  identity, initial_value);
-        }
-    }
-    else
-    {
-        TestOp<InputT>(num_items, identity, initial_value);
-    }
-}
-
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items = -1;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    args.GetCmdLineArgument("repeat", g_repeat);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--i=<timing iterations> "
-            "[--device=<device-id>] "
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--v] "
-            "[--cdp]"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-    g_device_giga_bandwidth = args.device_giga_bandwidth;
-    printf("\n");
-
-#ifdef QUICKER_TEST
-
-    // Compile/run basic CUB test
-    if (num_items < 0) num_items = 32000000;
-
-    TestPointer<CUB, char, int>(         num_items    , UNIFORM, Sum(), (int) (0));
-    TestPointer<CUB, int, int>(         num_items    , UNIFORM, Sum(), (int) (0));
-
-#elif defined(QUICK_TEST)
-
-    // Get device ordinal
-    int device_ordinal;
-    CubDebugExit(cudaGetDevice(&device_ordinal));
-
-    // Get device SM version
-    int sm_version;
-    CubDebugExit(SmVersion(sm_version, device_ordinal));
-
-    // Compile/run quick tests
-    if (num_items < 0) num_items = 32000000;
-
-    TestPointer<CUB, char, char>(        num_items * ((sm_version <= 130) ? 1 : 4), UNIFORM, Sum(), char(0));
-    TestPointer<THRUST, char, char>(     num_items * ((sm_version <= 130) ? 1 : 4), UNIFORM, Sum(), char(0));
-
-    printf("----------------------------\n");
-    TestPointer<CUB, short, short>(       num_items * ((sm_version <= 130) ? 1 : 2), UNIFORM, Sum(), short(0));
-    TestPointer<THRUST, short, short>(    num_items * ((sm_version <= 130) ? 1 : 2), UNIFORM, Sum(), short(0));
-
-    printf("----------------------------\n");
-    TestPointer<CUB, int, int>(         num_items    , UNIFORM, Sum(), (int) (0));
-    TestPointer<THRUST, int, int>(      num_items    , UNIFORM, Sum(), (int) (0));
-
-    printf("----------------------------\n");
-    TestPointer<CUB, long long, long long>(   num_items / 2, UNIFORM, Sum(), (long long) (0));
-    TestPointer<THRUST, long long, long long>(num_items / 2, UNIFORM, Sum(), (long long) (0));
-
-    printf("----------------------------\n");
-    TestPointer<CUB, TestBar, TestBar>(     num_items / 4, UNIFORM, Sum(), TestBar());
-    TestPointer<THRUST, TestBar, TestBar>(  num_items / 4, UNIFORM, Sum(), TestBar());
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        // Test different input+output data types
-        TestSize<unsigned char>(num_items,      (int) 0, (int) 99);
-
-        // Test same intput+output data types
-        TestSize<unsigned char>(num_items,      (unsigned char) 0,      (unsigned char) 99);
-        TestSize<char>(num_items,               (char) 0,               (char) 99);
-        TestSize<unsigned short>(num_items,     (unsigned short) 0,     (unsigned short)99);
-        TestSize<unsigned int>(num_items,       (unsigned int) 0,       (unsigned int) 99);
-        TestSize<unsigned long long>(num_items, (unsigned long long) 0, (unsigned long long) 99);
-
-        TestSize<uchar2>(num_items,     make_uchar2(0, 0),              make_uchar2(17, 21));
-        TestSize<char2>(num_items,      make_char2(0, 0),               make_char2(17, 21));
-        TestSize<ushort2>(num_items,    make_ushort2(0, 0),             make_ushort2(17, 21));
-        TestSize<uint2>(num_items,      make_uint2(0, 0),               make_uint2(17, 21));
-        TestSize<ulonglong2>(num_items, make_ulonglong2(0, 0),          make_ulonglong2(17, 21));
-        TestSize<uchar4>(num_items,     make_uchar4(0, 0, 0, 0),        make_uchar4(17, 21, 32, 85));
-        TestSize<char4>(num_items,      make_char4(0, 0, 0, 0),         make_char4(17, 21, 32, 85));
-
-        TestSize<ushort4>(num_items,    make_ushort4(0, 0, 0, 0),       make_ushort4(17, 21, 32, 85));
-        TestSize<uint4>(num_items,      make_uint4(0, 0, 0, 0),         make_uint4(17, 21, 32, 85));
-        TestSize<ulonglong4>(num_items, make_ulonglong4(0, 0, 0, 0),    make_ulonglong4(17, 21, 32, 85));
-
-        TestSize<TestFoo>(num_items,
-            TestFoo::MakeTestFoo(0, 0, 0, 0),
-            TestFoo::MakeTestFoo(1ll << 63, 1 << 31, short(1 << 15), char(1 << 7)));
-
-        TestSize<TestBar>(num_items,
-            TestBar(0, 0),
-            TestBar(1ll << 63, 1 << 31));
-    }
-
-#endif
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/test/test_device_select_if.cu b/external/cub/test/test_device_select_if.cu
deleted file mode 100644
index a02b020f336..00000000000
--- a/external/cub/test/test_device_select_if.cu
+++ /dev/null
@@ -1,1039 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of DeviceSelect::If and DevicePartition::If utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <typeinfo>
-
-#include <thrust/device_ptr.h>
-#include <thrust/copy.h>
-#include <thrust/partition.h>
-#include <thrust/iterator/reverse_iterator.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/device/device_select.cuh>
-#include <cub/device/device_partition.cuh>
-#include <cub/iterator/counting_input_iterator.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose               = false;
-int                     g_timing_iterations     = 0;
-int                     g_repeat                = 0;
-float                   g_device_giga_bandwidth;
-CachingDeviceAllocator  g_allocator(true);
-
-// Dispatch types
-enum Backend
-{
-    CUB,        // CUB method
-    THRUST,     // Thrust method
-    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
-};
-
-
-// Selection functor type
-template <typename T>
-struct LessThan
-{
-    T compare;
-
-    __host__ __device__ __forceinline__
-    LessThan(T compare) : compare(compare) {}
-
-    __host__ __device__ __forceinline__
-    bool operator()(const T &a) const {
-        return (a < compare);
-    }
-};
-
-//---------------------------------------------------------------------
-// Dispatch to different CUB DeviceSelect entrypoints
-//---------------------------------------------------------------------
-
-
-/**
- * Dispatch to select if entrypoint
- */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>               dispatch_to,
-    Int2Type<false>             is_flagged,
-    Int2Type<false>             is_partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t&                     temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
-    }
-    return error;
-}
-
-
-/**
- * Dispatch to partition if entrypoint
- */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>               dispatch_to,
-    Int2Type<false>             is_flagged,
-    Int2Type<true>              is_partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t&                     temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
-    }
-    return error;
-}
-
-
-/**
- * Dispatch to select flagged entrypoint
- */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>               dispatch_to,
-    Int2Type<true>              is_flagged,
-    Int2Type<false>             partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t&                     temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-
-/**
- * Dispatch to partition flagged entrypoint
- */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>               dispatch_to,
-    Int2Type<true>              is_flagged,
-    Int2Type<true>              partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t&                     temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-
-//---------------------------------------------------------------------
-// Dispatch to different Thrust entrypoints
-//---------------------------------------------------------------------
-
-/**
- * Dispatch to select if entrypoint
- */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-__host__ __forceinline__
-cudaError_t Dispatch(
-    Int2Type<THRUST>            dispatch_to,
-    Int2Type<false>             is_flagged,
-    Int2Type<false>             is_partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t&                     temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<OutputT>         d_out_wrapper_end;
-        thrust::device_ptr<InputT>          d_in_wrapper(d_in);
-        thrust::device_ptr<OutputT>         d_out_wrapper(d_out);
-
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, select_op);
-        }
-
-        OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);
-        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
-    }
-
-    return cudaSuccess;
-}
-
-
-/**
- * Dispatch to partition if entrypoint
- */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-__host__ __forceinline__
-cudaError_t Dispatch(
-    Int2Type<THRUST>            dispatch_to,
-    Int2Type<false>             is_flagged,
-    Int2Type<true>              is_partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t&                     temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;
-
-        thrust::device_ptr<InputT>       d_in_wrapper(d_in);
-        thrust::device_ptr<OutputT>       d_out_wrapper(d_out);
-
-        ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items);
-
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            d_out_wrapper_end = thrust::partition_copy(
-                d_in_wrapper,
-                d_in_wrapper + num_items,
-                d_out_wrapper,
-                d_out_unselected,
-                select_op);
-        }
-
-        OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_out_wrapper);
-        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
-    }
-
-    return cudaSuccess;
-}
-
-
-/**
- * Dispatch to select flagged entrypoint
- */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-__host__ __forceinline__
-cudaError_t Dispatch(
-    Int2Type<THRUST>            dispatch_to,
-    Int2Type<true>              is_flagged,
-    Int2Type<false>             is_partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t&                     temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    // The flag type
-    typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;
-
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<OutputT>     d_out_wrapper_end;
-        thrust::device_ptr<InputT>      d_in_wrapper(d_in);
-        thrust::device_ptr<OutputT>     d_out_wrapper(d_out);
-        thrust::device_ptr<FlagT>       d_flags_wrapper(d_flags);
-
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_flags_wrapper, d_out_wrapper, CastOp<bool>());
-        }
-
-        OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);
-        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
-    }
-
-    return cudaSuccess;
-}
-
-
-/**
- * Dispatch to partition flagged entrypoint
- */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-__host__ __forceinline__
-cudaError_t Dispatch(
-    Int2Type<THRUST>            dispatch_to,
-    Int2Type<true>              is_flagged,
-    Int2Type<true>              is_partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t&                     temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    // The flag type
-    typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;
-
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;
-
-        thrust::device_ptr<InputT>  d_in_wrapper(d_in);
-        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
-        thrust::device_ptr<FlagT>   d_flags_wrapper(d_flags);
-        ReverseOutputIteratorT      d_out_unselected(d_out_wrapper + num_items);
-
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            d_out_wrapper_end = thrust::partition_copy(
-                d_in_wrapper,
-                d_in_wrapper + num_items,
-                d_flags_wrapper,
-                d_out_wrapper,
-                d_out_unselected,
-                CastOp<bool>());
-        }
-
-        OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_out_wrapper);
-        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
-    }
-
-    return cudaSuccess;
-}
-
-
-//---------------------------------------------------------------------
-// CUDA Nested Parallelism Test Kernel
-//---------------------------------------------------------------------
-
-/**
- * Simple wrapper kernel to invoke DeviceSelect
- */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
-__global__ void CnpDispatchKernel(
-    IsFlaggedTag                is_flagged,
-    IsPartitionTag              is_partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t                      temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    bool                        debug_synchronous)
-{
-
-#ifndef CUB_CDP
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(Int2Type<CUB>(), is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, debug_synchronous);
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
-}
-
-
-/**
- * Dispatch to CDP kernel
- */
-template <typename InputIteratorT, typename FlagIteratorT, typename SelectOpT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT, typename IsFlaggedTag, typename IsPartitionTag>
-cudaError_t Dispatch(
-    Int2Type<CDP>               dispatch_to,
-    IsFlaggedTag                is_flagged,
-    IsPartitionTag              is_partition,
-    int                         timing_timing_iterations,
-    size_t*                     d_temp_storage_bytes,
-    cudaError_t*                d_cdp_error,
-
-    void*                       d_temp_storage,
-    size_t&                     temp_storage_bytes,
-    InputIteratorT              d_in,
-    FlagIteratorT               d_flags,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    SelectOpT                   select_op,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(is_flagged, is_partition, timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
-}
-
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-
-/**
- * Initialize problem
- */
-template <typename T>
-void Initialize(
-    T*  h_in,
-    int num_items)
-{
-    for (int i = 0; i < num_items; ++i)
-    {
-        // Initialize each item to a randomly selected value from [0..126]
-        unsigned int value;
-        RandomBits(value, 0, 0, 7);
-        if (value == 127)
-            value = 126;
-        InitValue(INTEGER_SEED, h_in[i], value);
-    }
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/**
- * Solve selection problem (and set corresponding flags)
- */
-template <
-    typename        InputIteratorT,
-    typename        FlagIteratorT,
-    typename        SelectOpT,
-    typename        T>
-int Solve(
-    InputIteratorT  h_in,
-    SelectOpT       select_op,
-    T*              h_reference,
-    FlagIteratorT   h_flags,
-    int             num_items)
-{
-    int num_selected = 0;
-    for (int i = 0; i < num_items; ++i)
-    {
-        if ((h_flags[i] = select_op(h_in[i])))
-        {
-            h_reference[num_selected] = h_in[i];
-            num_selected++;
-        }
-        else
-        {
-            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
-        }
-    }
-
-    return num_selected;
-}
-
-
-
-/**
- * Test DeviceSelect for a given problem input
- */
-template <
-    Backend             BACKEND,
-    bool                IS_FLAGGED,
-    bool                IS_PARTITION,
-    typename            DeviceInputIteratorT,
-    typename            FlagT,
-    typename            SelectOpT,
-    typename            T>
-void Test(
-    DeviceInputIteratorT    d_in,
-    FlagT*                  h_flags,
-    SelectOpT               select_op,
-    T*                      h_reference,
-    int                     num_selected,
-    int                     num_items)
-{
-    // Allocate device flags, output, and num-selected
-    FlagT*      d_flags = NULL;
-    T*          d_out = NULL;
-    int*        d_num_selected_out = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(FlagT) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
-
-    // Allocate CDP device arrays
-    size_t*         d_temp_storage_bytes = NULL;
-    cudaError_t*    d_cdp_error = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
-    d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Copy flags and clear device output array
-    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(FlagT) * num_items, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items));
-    CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));
-
-    // Run warmup/correctness iteration
-    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));
-
-    // Check for correctness (and display results, if specified)
-    int compare1 = (IS_PARTITION) ?
-        CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose) :
-        CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
-    printf("\t Data %s\n", compare1 ? "FAIL" : "PASS");
-
-    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
-    printf("\t Count %s\n", compare2 ? "FAIL" : "PASS");
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Performance
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, false));
-    gpu_timer.Stop();
-    float elapsed_millis = gpu_timer.ElapsedMillis();
-
-    // Display performance
-    if (g_timing_iterations > 0)
-    {
-        float   avg_millis          = elapsed_millis / g_timing_iterations;
-        float   giga_rate           = float(num_items) / avg_millis / 1000.0f / 1000.0f;
-        int     num_output_items    = (IS_PARTITION) ? num_items : num_selected;
-        int     num_flag_items      = (IS_FLAGGED) ? num_items : 0;
-        size_t  num_bytes           = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items;
-        float   giga_bandwidth      = float(num_bytes) / avg_millis / 1000.0f / 1000.0f;
-
-        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
-    }
-    printf("\n\n");
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Cleanup
-    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
-    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
-    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    // Correctness asserts
-    AssertEquals(0, compare1 | compare2);
-}
-
-
-/**
- * Test on pointer type
- */
-template <
-    Backend         BACKEND,
-    bool            IS_FLAGGED,
-    bool            IS_PARTITION,
-    typename        T>
-void TestPointer(
-    int             num_items,
-    float           select_ratio)
-{
-    typedef char FlagT;
-
-    // Allocate host arrays
-    T*      h_in        = new T[num_items];
-    FlagT*  h_flags     = new FlagT[num_items];
-    T*      h_reference = new T[num_items];
-
-    // Initialize input
-    Initialize(h_in, num_items);
-
-    // Select a comparison value that is select_ratio through the space of [0,127]
-    T compare;
-    if (select_ratio <= 0.0)
-        InitValue(INTEGER_SEED, compare, 0);        // select none
-    else if (select_ratio >= 1.0)
-        InitValue(INTEGER_SEED, compare, 127);      // select all
-    else
-        InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
-
-    LessThan<T> select_op(compare);
-    int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
-
-    if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
-    printf("\nPointer %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
-        (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
-        (IS_FLAGGED) ? "Flagged" : "If",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
-        num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T));
-    fflush(stdout);
-
-    // Allocate problem device arrays
-    T *d_in = NULL;
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
-
-    // Run Test
-    Test<BACKEND, IS_FLAGGED, IS_PARTITION>(d_in, h_flags, select_op, h_reference, num_selected, num_items);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (h_flags) delete[] h_flags;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-}
-
-
-/**
- * Test on iterator type
- */
-template <
-    Backend         BACKEND,
-    bool            IS_FLAGGED,
-    bool            IS_PARTITION,
-    typename        T>
-void TestIterator(
-    int             num_items,
-    float           select_ratio)
-{
-    typedef char FlagT;
-
-    // Allocate host arrays
-    T*      h_reference = new T[num_items];
-    FlagT*  h_flags = new FlagT[num_items];
-
-    // Use counting iterator as the input
-    CountingInputIterator<T, int> h_in(0);
-
-    // Select a comparison value that is select_ratio through the space of [0,127]
-    T compare;
-    if (select_ratio <= 0.0)
-        InitValue(INTEGER_SEED, compare, 0);        // select none
-    else if (select_ratio >= 1.0)
-        InitValue(INTEGER_SEED, compare, 127);      // select all
-    else
-        InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));
-
-    LessThan<T> select_op(compare);
-    int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);
-
-    if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
-    printf("\nIterator %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
-        (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
-        (IS_FLAGGED) ? "Flagged" : "If",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
-        num_items, num_selected, float(num_selected) / num_items, typeid(T).name(), (int) sizeof(T));
-    fflush(stdout);
-
-    // Run Test
-    Test<BACKEND, IS_FLAGGED, IS_PARTITION>(h_in, h_flags, select_op, h_reference, num_selected, num_items);
-
-    // Cleanup
-    if (h_reference) delete[] h_reference;
-    if (h_flags) delete[] h_flags;
-}
-
-
-/**
- * Test different selection ratios
- */
-template <
-    Backend         BACKEND,
-    bool            IS_FLAGGED,
-    bool            IS_PARTITION,
-    typename        T>
-void Test(
-    int             num_items)
-{
-    for (float select_ratio = 0.0f; select_ratio <= 1.0f; select_ratio += 0.2f)
-    {
-        TestPointer<BACKEND, IS_FLAGGED, IS_PARTITION, T>(num_items, select_ratio);
-    }
-}
-
-
-/**
- * Test (select vs. partition) and (flagged vs. functor)
- */
-template <
-    Backend         BACKEND,
-    typename        T>
-void TestMethod(
-    int             num_items)
-{
-    // Functor
-    Test<BACKEND, false, false, T>(num_items);
-    Test<BACKEND, false, true, T>(num_items);
-
-    // Flagged
-    Test<BACKEND, true, false, T>(num_items);
-    Test<BACKEND, true, true, T>(num_items);
-}
-
-
-/**
- * Test different dispatch
- */
-template <
-    typename        T>
-void TestOp(
-    int             num_items)
-{
-    TestMethod<CUB, T>(num_items);
-#ifdef CUB_CDP
-    TestMethod<CDP, T>(num_items);
-#endif
-}
-
-
-/**
- * Test different input sizes
- */
-template <typename T>
-void Test(
-    int             num_items)
-{
-    if (num_items < 0)
-    {
-        TestOp<T>(0);
-        TestOp<T>(1);
-        TestOp<T>(100);
-        TestOp<T>(10000);
-        TestOp<T>(1000000);
-    }
-    else
-    {
-        TestOp<T>(num_items);
-    }
-}
-
-/**
- * Test select/partition on pointer types
- */
-template <typename T>
-void ComparePointer(
-    int             num_items,
-    float           select_ratio)
-{
-    printf("-- Select-if ----------------------------\n");
-    TestPointer<CUB, false, false, T>(num_items, select_ratio);
-    TestPointer<THRUST, false, false, T>(num_items, select_ratio);
-
-    printf("-- Partition-if ----------------------------\n");
-    TestPointer<CUB, false, true, T>(num_items, select_ratio);
-    TestPointer<THRUST, false, true, T>(num_items, select_ratio);
-
-    printf("-- Select-flagged ----------------------------\n");
-    TestPointer<CUB, true, false, T>(num_items, select_ratio);
-    TestPointer<THRUST, true, false, T>(num_items, select_ratio);
-
-    printf("-- Partition-flagged ----------------------------\n");
-    TestPointer<CUB, true, true, T>(num_items, select_ratio);
-    TestPointer<THRUST, true, true, T>(num_items, select_ratio);
-
-}
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items           = -1;
-    float select_ratio      = 0.5;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    args.GetCmdLineArgument("repeat", g_repeat);
-    args.GetCmdLineArgument("ratio", select_ratio);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--i=<timing iterations> "
-            "[--device=<device-id>] "
-            "[--ratio=<selection ratio, default 0.5>] "
-            "[--repeat=<repetitions of entire test suite>] "
-            "[--v] "
-            "[--cdp] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-    g_device_giga_bandwidth = args.device_giga_bandwidth;
-    printf("\n");
-
-#ifdef QUICKER_TEST
-
-    // Compile/run basic CUB test
-    if (num_items < 0) num_items = 32000000;
-
-    printf("-- Select-if ----------------------------\n");
-    TestPointer<CUB, false, false, int>(num_items, select_ratio);
-
-    printf("-- Partition-if ----------------------------\n");
-    TestPointer<CUB, false, true, int>(num_items, select_ratio);
-
-    printf("-- Select-flagged ----------------------------\n");
-    TestPointer<CUB, true, false, int>(num_items, select_ratio);
-
-    printf("-- Partition-flagged ----------------------------\n");
-    TestPointer<CUB, true, true, int>(num_items, select_ratio);
-
-
-#elif defined(QUICK_TEST)
-
-    // Get device ordinal
-    int device_ordinal;
-    CubDebugExit(cudaGetDevice(&device_ordinal));
-
-    // Get device SM version
-    int sm_version;
-    CubDebugExit(SmVersion(sm_version, device_ordinal));
-
-    // Compile/run quick tests
-    if (num_items < 0) num_items = 32000000;
-
-    printf("-- Iterator ----------------------------\n");
-    TestIterator<CUB, false, false, int>(num_items, select_ratio);
-
-    ComparePointer<char>(       num_items * ((sm_version <= 130) ? 1 : 4),  select_ratio);
-    ComparePointer<short>(      num_items * ((sm_version <= 130) ? 1 : 2),  select_ratio);
-    ComparePointer<int>(        num_items,                                  select_ratio);
-    ComparePointer<long long>(  num_items / 2,                              select_ratio);
-    ComparePointer<TestFoo>(    num_items / 4,                              select_ratio);
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        // Test different input types
-        Test<unsigned char>(num_items);
-        Test<unsigned short>(num_items);
-        Test<unsigned int>(num_items);
-        Test<unsigned long long>(num_items);
-
-        Test<uchar2>(num_items);
-        Test<ushort2>(num_items);
-        Test<uint2>(num_items);
-        Test<ulonglong2>(num_items);
-
-        Test<uchar4>(num_items);
-        Test<ushort4>(num_items);
-        Test<uint4>(num_items);
-        Test<ulonglong4>(num_items);
-
-        Test<TestFoo>(num_items);
-        Test<TestBar>(num_items);
-    }
-
-#endif
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/test/test_device_select_unique.cu b/external/cub/test/test_device_select_unique.cu
deleted file mode 100644
index bd40a5c0eb9..00000000000
--- a/external/cub/test/test_device_select_unique.cu
+++ /dev/null
@@ -1,651 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of DeviceSelect::Unique utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <typeinfo>
-
-#include <thrust/device_ptr.h>
-#include <thrust/unique.h>
-
-#include <cub/util_allocator.cuh>
-#include <cub/iterator/counting_input_iterator.cuh>
-#include <cub/device/device_select.cuh>
-
-#include <thrust/device_ptr.h>
-#include <thrust/unique.h>
-
-#include "test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose               = false;
-int                     g_timing_iterations     = 0;
-int                     g_repeat                = 0;
-float                   g_device_giga_bandwidth;
-CachingDeviceAllocator  g_allocator(true);
-
-// Dispatch types
-enum Backend
-{
-    CUB,        // CUB method
-    THRUST,     // Thrust method
-    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
-};
-
-
-//---------------------------------------------------------------------
-// Dispatch to different CUB DeviceSelect entrypoints
-//---------------------------------------------------------------------
-
-
-/**
- * Dispatch to unique entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t Dispatch(
-    Int2Type<CUB>               dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    InputIteratorT              d_in,
-    OutputIteratorT              d_out,
-    NumSelectedIteratorT         d_num_selected_out,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    cudaError_t error = cudaSuccess;
-    for (int i = 0; i < timing_timing_iterations; ++i)
-    {
-        error = DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
-    }
-    return error;
-}
-
-
-//---------------------------------------------------------------------
-// Dispatch to different Thrust entrypoints
-//---------------------------------------------------------------------
-
-
-/**
- * Dispatch to unique entrypoint
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-__host__ __forceinline__
-cudaError_t Dispatch(
-    Int2Type<THRUST>            dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void                        *d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    InputIteratorT              d_in,
-    OutputIteratorT             d_out,
-    NumSelectedIteratorT        d_num_selected_out,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    if (d_temp_storage == 0)
-    {
-        temp_storage_bytes = 1;
-    }
-    else
-    {
-        thrust::device_ptr<OutputT> d_out_wrapper_end;
-        thrust::device_ptr<InputT> d_in_wrapper(d_in);
-        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
-        for (int i = 0; i < timing_timing_iterations; ++i)
-        {
-            d_out_wrapper_end = thrust::unique_copy(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
-        }
-
-        OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);
-        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
-
-    }
-
-    return cudaSuccess;
-}
-
-
-
-//---------------------------------------------------------------------
-// CUDA Nested Parallelism Test Kernel
-//---------------------------------------------------------------------
-
-/**
- * Simple wrapper kernel to invoke DeviceSelect
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-__global__ void CnpDispatchKernel(
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t                      temp_storage_bytes,
-    InputIteratorT              d_in,
-    OutputIteratorT              d_out,
-    NumSelectedIteratorT         d_num_selected_out,
-    OffsetT                     num_items,
-    bool                        debug_synchronous)
-{
-
-#ifndef CUB_CDP
-    *d_cdp_error = cudaErrorNotSupported;
-#else
-    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, debug_synchronous);
-    *d_temp_storage_bytes = temp_storage_bytes;
-#endif
-}
-
-
-/**
- * Dispatch to CDP kernel
- */
-template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
-cudaError_t Dispatch(
-    Int2Type<CDP>               dispatch_to,
-    int                         timing_timing_iterations,
-    size_t                      *d_temp_storage_bytes,
-    cudaError_t                 *d_cdp_error,
-
-    void*               d_temp_storage,
-    size_t                      &temp_storage_bytes,
-    InputIteratorT              d_in,
-    OutputIteratorT              d_out,
-    NumSelectedIteratorT         d_num_selected_out,
-    OffsetT                     num_items,
-    cudaStream_t                stream,
-    bool                        debug_synchronous)
-{
-    // Invoke kernel to invoke device-side dispatch
-    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
-        d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, debug_synchronous);
-
-    // Copy out temp_storage_bytes
-    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));
-
-    // Copy out error
-    cudaError_t retval;
-    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
-    return retval;
-}
-
-
-
-//---------------------------------------------------------------------
-// Test generation
-//---------------------------------------------------------------------
-
-
-/**
- * Initialize problem
- */
-template <typename T>
-void Initialize(
-    int         entropy_reduction,
-    T           *h_in,
-    int         num_items,
-    int         max_segment)
-{
-    unsigned int max_int = (unsigned int) -1;
-
-    int key = 0;
-    int i = 0;
-    while (i < num_items)
-    {
-        // Select number of repeating occurrences for the current run
-        int repeat;
-        if (max_segment < 0)
-        {
-            repeat = num_items;
-        }
-        else if (max_segment < 2)
-        {
-            repeat = 1;
-        }
-        else
-        {
-            RandomBits(repeat, entropy_reduction);
-            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
-            repeat = CUB_MAX(1, repeat);
-        }
-
-        int j = i;
-        while (j < CUB_MIN(i + repeat, num_items))
-        {
-            InitValue(INTEGER_SEED, h_in[j], key);
-            j++;
-        }
-
-        i = j;
-        key++;
-    }
-
-    if (g_verbose)
-    {
-        printf("Input:\n");
-        DisplayResults(h_in, num_items);
-        printf("\n\n");
-    }
-}
-
-
-/**
- * Solve unique problem
- */
-template <
-    typename        InputIteratorT,
-    typename        T>
-int Solve(
-    InputIteratorT  h_in,
-    T               *h_reference,
-    int             num_items)
-{
-    int num_selected = 0;
-    if (num_items > 0)
-    {
-        h_reference[num_selected] = h_in[0];
-        num_selected++;
-    }
-
-    for (int i = 1; i < num_items; ++i)
-    {
-        if (h_in[i] != h_in[i - 1])
-        {
-            h_reference[num_selected] = h_in[i];
-            num_selected++;
-        }
-    }
-
-    return num_selected;
-}
-
-
-
-/**
- * Test DeviceSelect for a given problem input
- */
-template <
-    Backend             BACKEND,
-    typename            DeviceInputIteratorT,
-    typename            T>
-void Test(
-    DeviceInputIteratorT d_in,
-    T                   *h_reference,
-    int                 num_selected,
-    int                 num_items)
-{
-    // Allocate device output array and num selected
-    T       *d_out            = NULL;
-    int     *d_num_selected_out   = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));
-
-    // Allocate CDP device arrays
-    size_t          *d_temp_storage_bytes = NULL;
-    cudaError_t     *d_cdp_error = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes,  sizeof(size_t) * 1));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error,           sizeof(cudaError_t) * 1));
-
-    // Allocate temporary storage
-    void            *d_temp_storage = NULL;
-    size_t          temp_storage_bytes = 0;
-    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, true));
-    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-    // Clear device output array
-    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items));
-    CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));
-
-    // Run warmup/correctness iteration
-    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, true));
-
-    // Check for correctness (and display results, if specified)
-    int compare1 = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
-    printf("\t Data %s ", compare1 ? "FAIL" : "PASS");
-
-    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
-    printf("\t Count %s ", compare2 ? "FAIL" : "PASS");
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Performance
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, false));
-    gpu_timer.Stop();
-    float elapsed_millis = gpu_timer.ElapsedMillis();
-
-    // Display performance
-    if (g_timing_iterations > 0)
-    {
-        float avg_millis        = elapsed_millis / g_timing_iterations;
-        float giga_rate         = float(num_items) / avg_millis / 1000.0f / 1000.0f;
-        float giga_bandwidth    = float((num_items + num_selected) * sizeof(T)) / avg_millis / 1000.0f / 1000.0f;
-        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
-    }
-    printf("\n\n");
-
-    // Flush any stdout/stderr
-    fflush(stdout);
-    fflush(stderr);
-
-    // Cleanup
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
-    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
-    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
-    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-
-    // Correctness asserts
-    AssertEquals(0, compare1 | compare2);
-}
-
-
-/**
- * Test DeviceSelect on pointer type
- */
-template <
-    Backend         BACKEND,
-    typename        T>
-void TestPointer(
-    int             num_items,
-    int             entropy_reduction,
-    int             max_segment)
-{
-    // Allocate host arrays
-    T*  h_in        = new T[num_items];
-    T*  h_reference = new T[num_items];
-
-    // Initialize problem and solution
-    Initialize(entropy_reduction, h_in, num_items, max_segment);
-    int num_selected = Solve(h_in, h_reference, num_items);
-
-    printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
-        num_items, num_selected, float(num_items) / num_selected,
-        typeid(T).name(),
-        (int) sizeof(T),
-        entropy_reduction);
-    fflush(stdout);
-
-    // Allocate problem device arrays
-    T *d_in = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));
-
-    // Initialize device input
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));
-
-    // Run Test
-    Test<BACKEND>(d_in, h_reference, num_selected, num_items);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-}
-
-
-/**
- * Test DeviceSelect on iterator type
- */
-template <
-    Backend         BACKEND,
-    typename        T>
-void TestIterator(
-    int             num_items)
-{
-    // Use a counting iterator as the input
-    CountingInputIterator<T, int> h_in(0);
-
-    // Allocate host arrays
-    T*  h_reference = new T[num_items];
-
-    // Initialize problem and solution
-    int num_selected = Solve(h_in, h_reference, num_items);
-
-    printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n",
-        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
-        num_items, num_selected, float(num_items) / num_selected,
-        typeid(T).name(),
-        (int) sizeof(T));
-    fflush(stdout);
-
-    // Run Test
-    Test<BACKEND>(h_in, h_reference, num_selected, num_items);
-
-    // Cleanup
-    if (h_reference) delete[] h_reference;
-}
-
-
-/**
- * Test different gen modes
- */
-template <
-    Backend         BACKEND,
-    typename        T>
-void Test(
-    int             num_items)
-{
-    for (int max_segment = 1; ((max_segment > 0) && (max_segment < num_items)); max_segment *= 11)
-    {
-        TestPointer<BACKEND, T>(num_items, 0, max_segment);
-        TestPointer<BACKEND, T>(num_items, 2, max_segment);
-        TestPointer<BACKEND, T>(num_items, 7, max_segment);
-    }
-}
-
-
-/**
- * Test different dispatch
- */
-template <
-    typename        T>
-void TestOp(
-    int             num_items)
-{
-    Test<CUB, T>(num_items);
-#ifdef CUB_CDP
-    Test<CDP, T>(num_items);
-#endif
-}
-
-
-/**
- * Test different input sizes
- */
-template <typename T>
-void Test(
-    int             num_items)
-{
-    if (num_items < 0)
-    {
-        TestOp<T>(0);
-        TestOp<T>(1);
-        TestOp<T>(100);
-        TestOp<T>(10000);
-        TestOp<T>(1000000);
-    }
-    else
-    {
-        TestOp<T>(num_items);
-    }
-}
-
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    int num_items           = -1;
-    int entropy_reduction   = 0;
-    int maxseg              = 1000;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("n", num_items);
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    args.GetCmdLineArgument("repeat", g_repeat);
-    args.GetCmdLineArgument("maxseg", maxseg);
-    args.GetCmdLineArgument("entropy", entropy_reduction);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--n=<input items> "
-            "[--i=<timing iterations> "
-            "[--device=<device-id>] "
-            "[--maxseg=<max segment length>]"
-            "[--entropy=<segment length bit entropy reduction rounds>]"
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--v] "
-            "[--cdp]"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-    g_device_giga_bandwidth = args.device_giga_bandwidth;
-    printf("\n");
-
-#ifdef QUICKER_TEST
-
-    // Compile/run basic CUB test
-    if (num_items < 0) num_items = 32000000;
-    TestPointer<CUB, int>(         num_items,                                 entropy_reduction, maxseg);
-
-#elif defined(QUICK_TEST)
-
-    // Get device ordinal
-    int device_ordinal;
-    CubDebugExit(cudaGetDevice(&device_ordinal));
-
-    // Get device SM version
-    int sm_version;
-    CubDebugExit(SmVersion(sm_version, device_ordinal));
-
-    // Compile/run quick tests
-    if (num_items < 0) num_items = 32000000;
-
-    printf("-- Iterator ----------------------------\n");
-    TestIterator<CUB, int>(        num_items);
-
-    printf("----------------------------\n");
-    TestPointer<CUB, char>(        num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg);
-    TestPointer<THRUST, char>(     num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg);
-
-    printf("----------------------------\n");
-    TestPointer<CUB, short>(       num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg);
-    TestPointer<THRUST, short>(    num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg);
-
-    printf("----------------------------\n");
-    TestPointer<CUB, int>(         num_items,                                 entropy_reduction, maxseg);
-    TestPointer<THRUST, int>(      num_items,                                 entropy_reduction, maxseg);
-
-    printf("----------------------------\n");
-    TestPointer<CUB, long long>(   num_items / 2,                             entropy_reduction, maxseg);
-    TestPointer<THRUST, long long>(num_items / 2,                             entropy_reduction, maxseg);
-
-    printf("----------------------------\n");
-    TestPointer<CUB, TestFoo>(     num_items / 4,                             entropy_reduction, maxseg);
-    TestPointer<THRUST, TestFoo>(  num_items / 4,                             entropy_reduction, maxseg);
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        // Test different input types
-        Test<unsigned char>(num_items);
-        Test<unsigned short>(num_items);
-        Test<unsigned int>(num_items);
-        Test<unsigned long long>(num_items);
-
-        Test<uchar2>(num_items);
-        Test<ushort2>(num_items);
-        Test<uint2>(num_items);
-        Test<ulonglong2>(num_items);
-
-        Test<uchar4>(num_items);
-        Test<ushort4>(num_items);
-        Test<uint4>(num_items);
-        Test<ulonglong4>(num_items);
-
-        Test<TestFoo>(num_items);
-        Test<TestBar>(num_items);
-    }
-
-#endif
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/test/test_grid_barrier.cu b/external/cub/test/test_grid_barrier.cu
deleted file mode 100644
index 24a0e3ce2d8..00000000000
--- a/external/cub/test/test_grid_barrier.cu
+++ /dev/null
@@ -1,152 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test evaluation for software global barrier throughput
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-
-#include <cub/grid/grid_barrier.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Test kernels
-//---------------------------------------------------------------------
-
-/**
- * Kernel that iterates through the specified number of software global barriers
- */
-__global__ void Kernel(
-    GridBarrier global_barrier,
-    int iterations)
-{
-    for (int i = 0; i < iterations; i++)
-    {
-        global_barrier.Sync();
-    }
-}
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    cudaError_t retval = cudaSuccess;
-
-    // Defaults
-    int iterations = 10000;
-    int block_size = 128;
-    int grid_size = -1;
-
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-
-    // Get args
-    args.GetCmdLineArgument("i", iterations);
-    args.GetCmdLineArgument("grid-size", grid_size);
-    args.GetCmdLineArgument("block-size", block_size);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>]"
-            "[--i=<iterations>]"
-            "[--grid-size<grid-size>]"
-            "[--block-size<block-size>]"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Get device ordinal
-    int device_ordinal;
-    CubDebugExit(cudaGetDevice(&device_ordinal));
-
-    // Get device SM version
-    int sm_version;
-    CubDebugExit(SmVersion(sm_version, device_ordinal));
-
-    // Get SM properties
-    int sm_count, max_block_threads, max_sm_occupancy;
-    CubDebugExit(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
-    CubDebugExit(cudaDeviceGetAttribute(&max_block_threads, cudaDevAttrMaxThreadsPerBlock, device_ordinal));
-    CubDebugExit(MaxSmOccupancy(max_sm_occupancy, EmptyKernel<void>, 32));
-
-    // Compute grid size and occupancy
-    int occupancy = CUB_MIN((max_block_threads / block_size), max_sm_occupancy);
-
-    if (grid_size == -1)
-    {
-        grid_size = occupancy * sm_count;
-    }
-    else
-    {
-        occupancy = grid_size / sm_count;
-    }
-
-    printf("Initializing software global barrier for Kernel<<<%d,%d>>> with %d occupancy\n",
-        grid_size, block_size, occupancy);
-    fflush(stdout);
-
-    // Init global barrier
-    GridBarrierLifetime global_barrier;
-    global_barrier.Setup(grid_size);
-
-    // Time kernel
-    GpuTimer gpu_timer;
-    gpu_timer.Start();
-    Kernel<<<grid_size, block_size>>>(global_barrier, iterations);
-    gpu_timer.Stop();
-
-    retval = CubDebug(cudaThreadSynchronize());
-
-    // Output timing results
-    float avg_elapsed = gpu_timer.ElapsedMillis() / float(iterations);
-    printf("%d iterations, %f total elapsed millis, %f avg elapsed millis\n",
-        iterations,
-        gpu_timer.ElapsedMillis(),
-        avg_elapsed);
-
-    return retval;
-}
diff --git a/external/cub/test/test_iterator.cu b/external/cub/test/test_iterator.cu
deleted file mode 100644
index fbcbdd2d904..00000000000
--- a/external/cub/test/test_iterator.cu
+++ /dev/null
@@ -1,805 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of iterator utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <iterator>
-#include <stdio.h>
-#include <typeinfo>
-
-#include <cub/iterator/arg_index_input_iterator.cuh>
-#include <cub/iterator/cache_modified_input_iterator.cuh>
-#include <cub/iterator/cache_modified_output_iterator.cuh>
-#include <cub/iterator/constant_input_iterator.cuh>
-#include <cub/iterator/counting_input_iterator.cuh>
-#include <cub/iterator/tex_obj_input_iterator.cuh>
-#include <cub/iterator/tex_ref_input_iterator.cuh>
-#include <cub/iterator/transform_input_iterator.cuh>
-
-#include <cub/util_type.cuh>
-#include <cub/util_allocator.cuh>
-
-#include "test_util.h"
-
-#include <thrust/device_ptr.h>
-#include <thrust/copy.h>
-
-using namespace cub;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose = false;
-CachingDeviceAllocator  g_allocator(true);
-
-// Dispatch types
-enum Backend
-{
-    CUB,        // CUB method
-    THRUST,     // Thrust method
-    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
-};
-
-
-template <typename T>
-struct TransformOp
-{
-    // Increment transform
-    __host__ __device__ __forceinline__ T operator()(T input) const
-    {
-        T addend;
-        InitValue(INTEGER_SEED, addend, 1);
-        return input + addend;
-    }
-};
-
-struct SelectOp
-{
-    template <typename T>
-    __host__ __device__ __forceinline__ bool operator()(T input)
-    {
-        return true;
-    }
-};
-
-
-//---------------------------------------------------------------------
-// Test kernels
-//---------------------------------------------------------------------
-
-/**
- * Test random access input iterator
- */
-template <
-    typename InputIteratorT,
-    typename T>
-__global__ void Kernel(
-    InputIteratorT    d_in,
-    T                 *d_out,
-    InputIteratorT    *d_itrs)
-{
-    d_out[0] = *d_in;               // Value at offset 0
-    d_out[1] = d_in[100];           // Value at offset 100
-    d_out[2] = *(d_in + 1000);      // Value at offset 1000
-    d_out[3] = *(d_in + 10000);     // Value at offset 10000
-
-    d_in++;
-    d_out[4] = d_in[0];             // Value at offset 1
-
-    d_in += 20;
-    d_out[5] = d_in[0];             // Value at offset 21
-    d_itrs[0] = d_in;               // Iterator at offset 21
-
-    d_in -= 10;
-    d_out[6] = d_in[0];             // Value at offset 11;
-
-    d_in -= 11;
-    d_out[7] = d_in[0];             // Value at offset 0
-    d_itrs[1] = d_in;               // Iterator at offset 0
-}
-
-
-
-//---------------------------------------------------------------------
-// Host testing subroutines
-//---------------------------------------------------------------------
-
-
-/**
- * Run iterator test on device
- */
-template <
-    typename        InputIteratorT,
-    typename        T,
-    int             TEST_VALUES>
-void Test(
-    InputIteratorT  d_in,
-    T               (&h_reference)[TEST_VALUES])
-{
-    // Allocate device arrays
-    T                 *d_out    = NULL;
-    InputIteratorT    *d_itrs   = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out,     sizeof(T) * TEST_VALUES));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_itrs,    sizeof(InputIteratorT) * 2));
-
-    int compare;
-
-    // Run unguarded kernel
-    Kernel<<<1, 1>>>(d_in, d_out, d_itrs);
-
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Check results
-    compare = CompareDeviceResults(h_reference, d_out, TEST_VALUES, g_verbose, g_verbose);
-    printf("\tValues: %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Check iterator at offset 21
-    InputIteratorT h_itr = d_in + 21;
-    compare = CompareDeviceResults(&h_itr, d_itrs, 1, g_verbose, g_verbose);
-    printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Check iterator at offset 0
-    compare = CompareDeviceResults(&d_in, d_itrs + 1, 1, g_verbose, g_verbose);
-    printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_itrs) CubDebugExit(g_allocator.DeviceFree(d_itrs));
-}
-
-
-/**
- * Test constant iterator
- */
-template <typename T>
-void TestConstant(T base)
-{
-    printf("\nTesting constant iterator on type %s (base: %lld)\n", typeid(T).name(), (unsigned long long) (base)); fflush(stdout);
-
-    //
-    // Test iterator manipulation in kernel
-    //
-
-    T h_reference[8] = {base, base, base, base, base, base, base, base};
-    ConstantInputIterator<T> d_itr(base);
-    Test(d_itr, h_reference);
-
-#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
-
-    //
-    // Test with thrust::copy_if()
-    //
-
-    int copy_items  = 100;
-    T   *h_copy     = new T[copy_items];
-    T   *d_copy     = NULL;
-
-    for (int i = 0; i < copy_items; ++i)
-        h_copy[i] = d_itr[i];
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * copy_items));
-    thrust::device_ptr<T> d_copy_wrapper(d_copy);
-
-    thrust::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp());
-
-    int compare = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose);
-    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    if (h_copy) delete[] h_copy;
-    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
-
-#endif // THRUST_VERSION
-}
-
-
-/**
- * Test counting iterator
- */
-template <typename T>
-void TestCounting(T base)
-{
-    printf("\nTesting counting iterator on type %s (base: %d) \n", typeid(T).name(), int(base)); fflush(stdout);
-
-    //
-    // Test iterator manipulation in kernel
-    //
-
-    // Initialize reference data
-    T h_reference[8];
-    h_reference[0] = base + 0;          // Value at offset 0
-    h_reference[1] = base + 100;        // Value at offset 100
-    h_reference[2] = base + 1000;       // Value at offset 1000
-    h_reference[3] = base + 10000;      // Value at offset 10000
-    h_reference[4] = base + 1;          // Value at offset 1
-    h_reference[5] = base + 21;         // Value at offset 21
-    h_reference[6] = base + 11;         // Value at offset 11
-    h_reference[7] = base + 0;          // Value at offset 0;
-
-    CountingInputIterator<T> d_itr(base);
-    Test(d_itr, h_reference);
-
-#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
-
-    //
-    // Test with thrust::copy_if()
-    //
-
-    unsigned long long  max_items   = ((1ull << ((sizeof(T) * 8) - 1)) - 1);
-    size_t  copy_items              = (size_t) CUB_MIN(max_items - base, 100);     // potential issue with differencing overflows when T is a smaller type than can handle the offset
-    T                   *h_copy     = new T[copy_items];
-    T                   *d_copy     = NULL;
-
-    for (unsigned long long i = 0; i < copy_items; ++i)
-        h_copy[i] = d_itr[i];
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * copy_items));
-    thrust::device_ptr<T> d_copy_wrapper(d_copy);
-    thrust::copy_if(d_itr, d_itr + copy_items, d_copy_wrapper, SelectOp());
-
-    int compare = CompareDeviceResults(h_copy, d_copy, copy_items, g_verbose, g_verbose);
-    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    if (h_copy) delete[] h_copy;
-    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
-
-#endif // THRUST_VERSION
-}
-
-
-/**
- * Test modified iterator
- */
-template <typename T, typename CastT>
-void TestModified()
-{
-    printf("\nTesting cache-modified iterator on type %s\n", typeid(T).name()); fflush(stdout);
-
-    //
-    // Test iterator manipulation in kernel
-    //
-
-    const unsigned int TEST_VALUES = 11000;
-
-    T *h_data = new T[TEST_VALUES];
-    for (int i = 0; i < TEST_VALUES; ++i)
-    {
-        RandomBits(h_data[i]);
-    }
-
-    // Allocate device arrays
-    T *d_data = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
-    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
-
-    // Initialize reference data
-    T h_reference[8];
-    h_reference[0] = h_data[0];          // Value at offset 0
-    h_reference[1] = h_data[100];        // Value at offset 100
-    h_reference[2] = h_data[1000];       // Value at offset 1000
-    h_reference[3] = h_data[10000];      // Value at offset 10000
-    h_reference[4] = h_data[1];          // Value at offset 1
-    h_reference[5] = h_data[21];         // Value at offset 21
-    h_reference[6] = h_data[11];         // Value at offset 11
-    h_reference[7] = h_data[0];          // Value at offset 0;
-
-    Test(CacheModifiedInputIterator<LOAD_DEFAULT, T>((CastT*) d_data), h_reference);
-    Test(CacheModifiedInputIterator<LOAD_CA, T>((CastT*) d_data), h_reference);
-    Test(CacheModifiedInputIterator<LOAD_CG, T>((CastT*) d_data), h_reference);
-    Test(CacheModifiedInputIterator<LOAD_CS, T>((CastT*) d_data), h_reference);
-    Test(CacheModifiedInputIterator<LOAD_CV, T>((CastT*) d_data), h_reference);
-    Test(CacheModifiedInputIterator<LOAD_LDG, T>((CastT*) d_data), h_reference);
-    Test(CacheModifiedInputIterator<LOAD_VOLATILE, T>((CastT*) d_data), h_reference);
-
-#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
-
-    //
-    // Test with thrust::copy_if()
-    //
-
-    T *d_copy = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
-
-    CacheModifiedInputIterator<LOAD_CG, T> d_in_itr((CastT*) d_data);
-    CacheModifiedOutputIterator<STORE_CG, T> d_out_itr((CastT*) d_copy);
-
-    thrust::copy_if(d_in_itr, d_in_itr + TEST_VALUES, d_out_itr, SelectOp());
-
-    int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
-    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
-
-#endif // THRUST_VERSION
-
-    if (h_data) delete[] h_data;
-    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
-}
-
-
-/**
- * Test transform iterator
- */
-template <typename T, typename CastT>
-void TestTransform()
-{
-    printf("\nTesting transform iterator on type %s\n", typeid(T).name()); fflush(stdout);
-
-    //
-    // Test iterator manipulation in kernel
-    //
-
-    const unsigned int TEST_VALUES = 11000;
-
-    T *h_data = new T[TEST_VALUES];
-    for (int i = 0; i < TEST_VALUES; ++i)
-    {
-        InitValue(INTEGER_SEED, h_data[i], i);
-    }
-
-    // Allocate device arrays
-    T *d_data = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
-    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
-
-    TransformOp<T> op;
-
-    // Initialize reference data
-    T h_reference[8];
-    h_reference[0] = op(h_data[0]);          // Value at offset 0
-    h_reference[1] = op(h_data[100]);        // Value at offset 100
-    h_reference[2] = op(h_data[1000]);       // Value at offset 1000
-    h_reference[3] = op(h_data[10000]);      // Value at offset 10000
-    h_reference[4] = op(h_data[1]);          // Value at offset 1
-    h_reference[5] = op(h_data[21]);         // Value at offset 21
-    h_reference[6] = op(h_data[11]);         // Value at offset 11
-    h_reference[7] = op(h_data[0]);          // Value at offset 0;
-
-    TransformInputIterator<T, TransformOp<T>, CastT*> d_itr((CastT*) d_data, op);
-    Test(d_itr, h_reference);
-
-#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
-
-    //
-    // Test with thrust::copy_if()
-    //
-
-    T *h_copy = new T[TEST_VALUES];
-    for (int i = 0; i < TEST_VALUES; ++i)
-        h_copy[i] = op(h_data[i]);
-
-    T *d_copy = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
-    thrust::device_ptr<T> d_copy_wrapper(d_copy);
-
-    thrust::copy_if(d_itr, d_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
-
-    int compare = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose);
-    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (h_copy) delete[] h_copy;
-    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
-
-#endif // THRUST_VERSION
-
-    if (h_data) delete[] h_data;
-    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
-}
-
-
-/**
- * Test tex-obj texture iterator
- */
-template <typename T, typename CastT>
-void TestTexObj()
-{
-    printf("\nTesting tex-obj iterator on type %s\n", typeid(T).name()); fflush(stdout);
-
-    //
-    // Test iterator manipulation in kernel
-    //
-
-    const unsigned int TEST_VALUES          = 11000;
-    const unsigned int DUMMY_OFFSET         = 500;
-    const unsigned int DUMMY_TEST_VALUES    = TEST_VALUES - DUMMY_OFFSET;
-
-    T *h_data = new T[TEST_VALUES];
-    for (int i = 0; i < TEST_VALUES; ++i)
-    {
-        RandomBits(h_data[i]);
-    }
-
-    // Allocate device arrays
-    T *d_data   = NULL;
-    T *d_dummy  = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
-    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
-    CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));
-
-    // Initialize reference data
-    T h_reference[8];
-    h_reference[0] = h_data[0];          // Value at offset 0
-    h_reference[1] = h_data[100];        // Value at offset 100
-    h_reference[2] = h_data[1000];       // Value at offset 1000
-    h_reference[3] = h_data[10000];      // Value at offset 10000
-    h_reference[4] = h_data[1];          // Value at offset 1
-    h_reference[5] = h_data[21];         // Value at offset 21
-    h_reference[6] = h_data[11];         // Value at offset 11
-    h_reference[7] = h_data[0];          // Value at offset 0;
-
-    // Create and bind obj-based test iterator
-    TexObjInputIterator<T> d_obj_itr;
-    CubDebugExit(d_obj_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
-
-    Test(d_obj_itr, h_reference);
-
-#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
-
-    //
-    // Test with thrust::copy_if()
-    //
-
-    T *d_copy = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
-    thrust::device_ptr<T> d_copy_wrapper(d_copy);
-
-    CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES));
-    thrust::copy_if(d_obj_itr, d_obj_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
-
-    int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
-    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    CubDebugExit(d_obj_itr.UnbindTexture());
-
-    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
-
-#endif  // THRUST_VERSION
-
-    if (h_data) delete[] h_data;
-    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
-    if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy));
-}
-
-
-#if CUDA_VERSION >= 5050
-
-/**
- * Test tex-ref texture iterator
- */
-template <typename T, typename CastT>
-void TestTexRef()
-{
-    printf("\nTesting tex-ref iterator on type %s\n", typeid(T).name()); fflush(stdout);
-
-    //
-    // Test iterator manipulation in kernel
-    //
-
-    const unsigned int TEST_VALUES          = 11000;
-    const unsigned int DUMMY_OFFSET         = 500;
-    const unsigned int DUMMY_TEST_VALUES    = TEST_VALUES - DUMMY_OFFSET;
-
-    T *h_data = new T[TEST_VALUES];
-    for (int i = 0; i < TEST_VALUES; ++i)
-    {
-        RandomBits(h_data[i]);
-    }
-
-    // Allocate device arrays
-    T *d_data   = NULL;
-    T *d_dummy  = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
-    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
-    CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));
-
-    // Initialize reference data
-    T h_reference[8];
-    h_reference[0] = h_data[0];          // Value at offset 0
-    h_reference[1] = h_data[100];        // Value at offset 100
-    h_reference[2] = h_data[1000];       // Value at offset 1000
-    h_reference[3] = h_data[10000];      // Value at offset 10000
-    h_reference[4] = h_data[1];          // Value at offset 1
-    h_reference[5] = h_data[21];         // Value at offset 21
-    h_reference[6] = h_data[11];         // Value at offset 11
-    h_reference[7] = h_data[0];          // Value at offset 0;
-
-    // Create and bind ref-based test iterator
-    TexRefInputIterator<T, __LINE__> d_ref_itr;
-    CubDebugExit(d_ref_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
-
-    // Create and bind dummy iterator of same type to check with interferance
-    TexRefInputIterator<T, __LINE__> d_ref_itr2;
-    CubDebugExit(d_ref_itr2.BindTexture((CastT*) d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
-
-    Test(d_ref_itr, h_reference);
-
-#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
-
-    //
-    // Test with thrust::copy_if()
-    //
-
-    T *d_copy = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
-    thrust::device_ptr<T> d_copy_wrapper(d_copy);
-
-    CubDebugExit(cudaMemset(d_copy, 0, sizeof(T) * TEST_VALUES));
-    thrust::copy_if(d_ref_itr, d_ref_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
-
-    int compare = CompareDeviceResults(h_data, d_copy, TEST_VALUES, g_verbose, g_verbose);
-    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
-
-#endif  // THRUST_VERSION
-
-    CubDebugExit(d_ref_itr.UnbindTexture());
-    CubDebugExit(d_ref_itr2.UnbindTexture());
-
-    if (h_data) delete[] h_data;
-    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
-    if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy));
-}
-
-
-/**
- * Test texture transform iterator
- */
-template <typename T, typename CastT>
-void TestTexTransform()
-{
-    printf("\nTesting tex-transform iterator on type %s\n", typeid(T).name()); fflush(stdout);
-
-    //
-    // Test iterator manipulation in kernel
-    //
-
-    const unsigned int TEST_VALUES = 11000;
-
-    T *h_data = new T[TEST_VALUES];
-    for (int i = 0; i < TEST_VALUES; ++i)
-    {
-        InitValue(INTEGER_SEED, h_data[i], i);
-    }
-
-    // Allocate device arrays
-    T *d_data = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
-    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
-
-    TransformOp<T> op;
-
-    // Initialize reference data
-    T h_reference[8];
-    h_reference[0] = op(h_data[0]);          // Value at offset 0
-    h_reference[1] = op(h_data[100]);        // Value at offset 100
-    h_reference[2] = op(h_data[1000]);       // Value at offset 1000
-    h_reference[3] = op(h_data[10000]);      // Value at offset 10000
-    h_reference[4] = op(h_data[1]);          // Value at offset 1
-    h_reference[5] = op(h_data[21]);         // Value at offset 21
-    h_reference[6] = op(h_data[11]);         // Value at offset 11
-    h_reference[7] = op(h_data[0]);          // Value at offset 0;
-
-    // Create and bind texture iterator
-    typedef TexRefInputIterator<T, __LINE__> TextureIterator;
-
-    TextureIterator d_tex_itr;
-    CubDebugExit(d_tex_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
-
-    // Create transform iterator
-    TransformInputIterator<T, TransformOp<T>, TextureIterator> xform_itr(d_tex_itr, op);
-
-    Test(xform_itr, h_reference);
-
-#if (THRUST_VERSION >= 100700)  // Thrust 1.7 or newer
-
-    //
-    // Test with thrust::copy_if()
-    //
-
-    T *h_copy = new T[TEST_VALUES];
-    for (int i = 0; i < TEST_VALUES; ++i)
-        h_copy[i] = op(h_data[i]);
-
-    T *d_copy = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_copy, sizeof(T) * TEST_VALUES));
-    thrust::device_ptr<T> d_copy_wrapper(d_copy);
-
-    thrust::copy_if(xform_itr, xform_itr + TEST_VALUES, d_copy_wrapper, SelectOp());
-
-    int compare = CompareDeviceResults(h_copy, d_copy, TEST_VALUES, g_verbose, g_verbose);
-    printf("\tthrust::copy_if(): %s\n", (compare) ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Cleanup
-    if (h_copy) delete[] h_copy;
-    if (d_copy) CubDebugExit(g_allocator.DeviceFree(d_copy));
-
-#endif  // THRUST_VERSION
-
-    CubDebugExit(d_tex_itr.UnbindTexture());
-    if (h_data) delete[] h_data;
-    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
-}
-
-#endif  // CUDA_VERSION
-
-
-
-
-/**
- * Run non-integer tests
- */
-template <typename T, typename CastT>
-void Test(Int2Type<false> is_integer)
-{
-    TestModified<T, CastT>();
-    TestTransform<T, CastT>();
-
-#if CUB_CDP
-    // Test tex-obj iterators if CUDA dynamic parallelism enabled
-    TestTexObj<T, CastT>(type_string);
-#endif  // CUB_CDP
-
-#if CUDA_VERSION >= 5050
-    // Test tex-ref iterators for CUDA 5.5
-    TestTexRef<T, CastT>();
-    TestTexTransform<T, CastT>();
-#endif  // CUDA_VERSION
-}
-
-/**
- * Run integer tests
- */
-template <typename T, typename CastT>
-void Test(Int2Type<true> is_integer)
-{
-    TestConstant<T>(0);
-    TestConstant<T>(99);
-
-    TestCounting<T>(0);
-    TestCounting<T>(99);
-
-    // Run non-integer tests
-    Test<T, CastT>(Int2Type<false>());
-}
-
-/**
- * Run tests
- */
-template <typename T>
-void Test()
-{
-    enum {
-        IS_INTEGER = (Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER)
-    };
-
-    // Test non-const type
-    Test<T, T>(Int2Type<IS_INTEGER>());
-
-    // Test non-const type
-    Test<T, const T>(Int2Type<IS_INTEGER>());
-}
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-    // Get ptx version
-    int ptx_version;
-    CubDebugExit(PtxVersion(ptx_version));
-
-    // Evaluate different data types
-    Test<char>();
-    Test<short>();
-    Test<int>();
-    Test<long>();
-    Test<long long>();
-    Test<float>();
-    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
-        Test<double>();
-
-    Test<char2>();
-    Test<short2>();
-    Test<int2>();
-    Test<long2>();
-    Test<longlong2>();
-    Test<float2>();
-    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
-        Test<double2>();
-
-    Test<char3>();
-    Test<short3>();
-    Test<int3>();
-    Test<long3>();
-    Test<longlong3>();
-    Test<float3>();
-    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
-        Test<double3>();
-
-    Test<char4>();
-    Test<short4>();
-    Test<int4>();
-    Test<long4>();
-    Test<longlong4>();
-    Test<float4>();
-    if (ptx_version > 120)                          // Don't check doubles on PTX120 or below because they're down-converted
-        Test<double4>();
-
-    Test<TestFoo>();
-    Test<TestBar>();
-
-    printf("\nTest complete\n"); fflush(stdout);
-
-    return 0;
-}
-
-
-
diff --git a/external/cub/test/test_util.h b/external/cub/test/test_util.h
deleted file mode 100644
index 621726214e2..00000000000
--- a/external/cub/test/test_util.h
+++ /dev/null
@@ -1,1600 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-
-#pragma once
-
-#if defined(_WIN32) || defined(_WIN64)
-    #include <windows.h>
-    #undef small            // Windows is terrible for polluting macro namespace
-#else
-    #include <sys/resource.h>
-#endif
-
-#include <cuda_runtime.h>
-
-#include <stdio.h>
-#include <math.h>
-#include <float.h>
-
-#include <string>
-#include <vector>
-#include <sstream>
-#include <iostream>
-#include <limits>
-
-#include "mersenne.h"
-
-#include "cub/util_debug.cuh"
-#include "cub/util_device.cuh"
-#include "cub/util_type.cuh"
-#include "cub/util_macro.cuh"
-
-/******************************************************************************
- * Assertion macros
- ******************************************************************************/
-
-/**
- * Assert equals
- */
-#define AssertEquals(a, b) if ((a) != (b)) { std::cerr << "\n(" << __FILE__ << ": " << __LINE__ << ")\n"; exit(1);}
-
-
-/******************************************************************************
- * Command-line parsing functionality
- ******************************************************************************/
-
-/**
- * Utility for parsing command line arguments
- */
-struct CommandLineArgs
-{
-
-    std::vector<std::string>    keys;
-    std::vector<std::string>    values;
-    std::vector<std::string>    args;
-    cudaDeviceProp              deviceProp;
-    float                       device_giga_bandwidth;
-    size_t                      device_free_physmem;
-    size_t                      device_total_physmem;
-
-    /**
-     * Constructor
-     */
-    CommandLineArgs(int argc, char **argv) :
-        keys(10),
-        values(10)
-    {
-        using namespace std;
-
-        // Initialize mersenne generator
-        unsigned int mersenne_init[4]=  {0x123, 0x234, 0x345, 0x456};
-        mersenne::init_by_array(mersenne_init, 4);
-
-        for (int i = 1; i < argc; i++)
-        {
-            string arg = argv[i];
-
-            if ((arg[0] != '-') || (arg[1] != '-'))
-            {
-                args.push_back(arg);
-                continue;
-            }
-
-            string::size_type pos;
-            string key, val;
-            if ((pos = arg.find('=')) == string::npos) {
-                key = string(arg, 2, arg.length() - 2);
-                val = "";
-            } else {
-                key = string(arg, 2, pos - 2);
-                val = string(arg, pos + 1, arg.length() - 1);
-            }
-
-            keys.push_back(key);
-            values.push_back(val);
-        }
-    }
-
-
-    /**
-     * Checks whether a flag "--<flag>" is present in the commandline
-     */
-    bool CheckCmdLineFlag(const char* arg_name)
-    {
-        using namespace std;
-
-        for (int i = 0; i < int(keys.size()); ++i)
-        {
-            if (keys[i] == string(arg_name))
-                return true;
-        }
-        return false;
-    }
-
-
-    /**
-     * Returns number of naked (non-flag and non-key-value) commandline parameters
-     */
-    template <typename T>
-    int NumNakedArgs()
-    {
-        return args.size();
-    }
-
-
-    /**
-     * Returns the commandline parameter for a given index (not including flags)
-     */
-    template <typename T>
-    void GetCmdLineArgument(int index, T &val)
-    {
-        using namespace std;
-        if (index < args.size()) {
-            istringstream str_stream(args[index]);
-            str_stream >> val;
-        }
-    }
-
-    /**
-     * Returns the value specified for a given commandline parameter --<flag>=<value>
-     */
-    template <typename T>
-    void GetCmdLineArgument(const char *arg_name, T &val)
-    {
-        using namespace std;
-
-        for (int i = 0; i < int(keys.size()); ++i)
-        {
-            if (keys[i] == string(arg_name))
-            {
-                istringstream str_stream(values[i]);
-                str_stream >> val;
-            }
-        }
-    }
-
-
-    /**
-     * Returns the values specified for a given commandline parameter --<flag>=<value>,<value>*
-     */
-    template <typename T>
-    void GetCmdLineArguments(const char *arg_name, std::vector<T> &vals)
-    {
-        using namespace std;
-
-        if (CheckCmdLineFlag(arg_name))
-        {
-            // Clear any default values
-            vals.clear();
-
-            // Recover from multi-value string
-            for (int i = 0; i < keys.size(); ++i)
-            {
-                if (keys[i] == string(arg_name))
-                {
-                    string val_string(values[i]);
-                    istringstream str_stream(val_string);
-                    string::size_type old_pos = 0;
-                    string::size_type new_pos = 0;
-
-                    // Iterate comma-separated values
-                    T val;
-                    while ((new_pos = val_string.find(',', old_pos)) != string::npos)
-                    {
-                        if (new_pos != old_pos)
-                        {
-                            str_stream.width(new_pos - old_pos);
-                            str_stream >> val;
-                            vals.push_back(val);
-                        }
-
-                        // skip over comma
-                        str_stream.ignore(1);
-                        old_pos = new_pos + 1;
-                    }
-
-                    // Read last value
-                    str_stream >> val;
-                    vals.push_back(val);
-                }
-            }
-        }
-    }
-
-
-    /**
-     * The number of pairs parsed
-     */
-    int ParsedArgc()
-    {
-        return (int) keys.size();
-    }
-
-    /**
-     * Initialize device
-     */
-    cudaError_t DeviceInit(int dev = -1)
-    {
-        cudaError_t error = cudaSuccess;
-
-        do
-        {
-            int deviceCount;
-            error = CubDebug(cudaGetDeviceCount(&deviceCount));
-            if (error) break;
-
-            if (deviceCount == 0) {
-                fprintf(stderr, "No devices supporting CUDA.\n");
-                exit(1);
-            }
-            if (dev < 0)
-            {
-                GetCmdLineArgument("device", dev);
-            }
-            if ((dev > deviceCount - 1) || (dev < 0))
-            {
-                dev = 0;
-            }
-
-            error = CubDebug(cudaSetDevice(dev));
-            if (error) break;
-
-            CubDebugExit(cudaMemGetInfo(&device_free_physmem, &device_total_physmem));
-
-            int ptx_version;
-            error = CubDebug(cub::PtxVersion(ptx_version));
-            if (error) break;
-
-            error = CubDebug(cudaGetDeviceProperties(&deviceProp, dev));
-            if (error) break;
-
-            if (deviceProp.major < 1) {
-                fprintf(stderr, "Device does not support CUDA.\n");
-                exit(1);
-            }
-
-            device_giga_bandwidth = float(deviceProp.memoryBusWidth) * deviceProp.memoryClockRate * 2 / 8 / 1000 / 1000;
-
-            if (!CheckCmdLineFlag("quiet"))
-            {
-                printf(
-                        "Using device %d: %s (PTX version %d, SM%d, %d SMs, "
-                        "%lld free / %lld total MB physmem, "
-                        "%.3f GB/s @ %d kHz mem clock, ECC %s)\n",
-                    dev,
-                    deviceProp.name,
-                    ptx_version,
-                    deviceProp.major * 100 + deviceProp.minor * 10,
-                    deviceProp.multiProcessorCount,
-                    (unsigned long long) device_free_physmem / 1024 / 1024,
-                    (unsigned long long) device_total_physmem / 1024 / 1024,
-                    device_giga_bandwidth,
-                    deviceProp.memoryClockRate,
-                    (deviceProp.ECCEnabled) ? "on" : "off");
-                fflush(stdout);
-            }
-
-        } while (0);
-
-        return error;
-    }
-};
-
-/******************************************************************************
- * Random bits generator
- ******************************************************************************/
-
-int g_num_rand_samples = 0;
-
-
-template <typename T>
-bool IsNaN(T val) { return false; }
-
-template<>
-__noinline__ bool IsNaN<float>(float val)
-{
-    volatile unsigned int bits = reinterpret_cast<unsigned int &>(val);
-
-    return (((bits >= 0x7F800001) && (bits <= 0x7FFFFFFF)) || 
-        ((bits >= 0xFF800001) && (bits <= 0xFFFFFFFF)));
-}
-
-template<>
-__noinline__ bool IsNaN<float1>(float1 val)
-{
-    return (IsNaN(val.x));
-}
-
-template<>
-__noinline__ bool IsNaN<float2>(float2 val)
-{
-    return (IsNaN(val.y) || IsNaN(val.x));
-}
-
-template<>
-__noinline__ bool IsNaN<float3>(float3 val)
-{
-    return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
-}
-
-template<>
-__noinline__ bool IsNaN<float4>(float4 val)
-{
-    return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
-}
-
-template<>
-__noinline__ bool IsNaN<double>(double val)
-{
-    volatile unsigned long long bits = *reinterpret_cast<unsigned long long *>(&val);
-
-    return (((bits >= 0x7FF0000000000001) && (bits <= 0x7FFFFFFFFFFFFFFF)) || 
-        ((bits >= 0xFFF0000000000001) && (bits <= 0xFFFFFFFFFFFFFFFF)));
-}
-
-template<>
-__noinline__ bool IsNaN<double1>(double1 val)
-{
-    return (IsNaN(val.x));
-}
-
-template<>
-__noinline__ bool IsNaN<double2>(double2 val)
-{
-    return (IsNaN(val.y) || IsNaN(val.x));
-}
-
-template<>
-__noinline__ bool IsNaN<double3>(double3 val)
-{
-    return (IsNaN(val.z) || IsNaN(val.y) || IsNaN(val.x));
-}
-
-template<>
-__noinline__ bool IsNaN<double4>(double4 val)
-{
-    return (IsNaN(val.y) || IsNaN(val.x) || IsNaN(val.w) || IsNaN(val.z));
-}
-
-
-/**
- * Generates random keys.
- *
- * We always take the second-order byte from rand() because the higher-order
- * bits returned by rand() are commonly considered more uniformly distributed
- * than the lower-order bits.
- *
- * We can decrease the entropy level of keys by adopting the technique
- * of Thearling and Smith in which keys are computed from the bitwise AND of
- * multiple random samples:
- *
- * entropy_reduction    | Effectively-unique bits per key
- * -----------------------------------------------------
- * -1                   | 0
- * 0                    | 32
- * 1                    | 25.95 (81%)
- * 2                    | 17.41 (54%)
- * 3                    | 10.78 (34%)
- * 4                    | 6.42 (20%)
- * ...                  | ...
- *
- */
-template <typename K>
-void RandomBits(
-    K &key,
-    int entropy_reduction = 0,
-    int begin_bit = 0,
-    int end_bit = sizeof(K) * 8)
-{
-    const int NUM_BYTES = sizeof(K);
-    const int WORD_BYTES = sizeof(unsigned int);
-    const int NUM_WORDS = (NUM_BYTES + WORD_BYTES - 1) / WORD_BYTES;
-
-    unsigned int word_buff[NUM_WORDS];
-
-    if (entropy_reduction == -1)
-    {
-        memset((void *) &key, 0, sizeof(key));
-        return;
-    }
-
-    if (end_bit < 0)
-        end_bit = sizeof(K) * 8;
-
-    while (true) 
-    {
-        // Generate random word_buff
-        for (int j = 0; j < NUM_WORDS; j++)
-        {
-            int current_bit = j * WORD_BYTES * 8;
-
-            unsigned int word = 0xffffffff;
-            word &= 0xffffffff << CUB_MAX(0, begin_bit - current_bit);
-            word &= 0xffffffff >> CUB_MAX(0, (current_bit + (WORD_BYTES * 8)) - end_bit);
-
-            for (int i = 0; i <= entropy_reduction; i++)
-            {
-                // Grab some of the higher bits from rand (better entropy, supposedly)
-                word &= mersenne::genrand_int32();
-                g_num_rand_samples++;                
-            }
-
-            word_buff[j] = word;
-        }
-
-        memcpy(&key, word_buff, sizeof(K));
-
-        K copy = key;
-        if (!IsNaN(copy))
-            break;          // avoids NaNs when generating random floating point numbers
-    }
-}
-
-/// Randomly select number between [0:max)
-template <typename T>
-T RandomValue(T max)
-{
-    unsigned int bits;
-    unsigned int max_int = (unsigned int) -1;
-    do {
-        RandomBits(bits);
-    } while (bits == max_int);
-
-    return (T) ((double(bits) / double(max_int)) * double(max));
-}
-
-
-/******************************************************************************
- * Console printing utilities
- ******************************************************************************/
-
-/**
- * Helper for casting character types to integers for cout printing
- */
-template <typename T>
-T CoutCast(T val) { return val; }
-
-int CoutCast(char val) { return val; }
-
-int CoutCast(unsigned char val) { return val; }
-
-int CoutCast(signed char val) { return val; }
-
-
-
-/******************************************************************************
- * Test value initialization utilities
- ******************************************************************************/
-
-/**
- * Test problem generation options
- */
-enum GenMode
-{
-    UNIFORM,            // Assign to '2', regardless of integer seed
-    INTEGER_SEED,       // Assign to integer seed
-    RANDOM,             // Assign to random, regardless of integer seed
-};
-
-/**
- * Initialize value
- */
-template <typename T>
-__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)
-{
-    switch (gen_mode)
-    {
-#if (CUB_PTX_ARCH == 0)
-    case RANDOM:
-         RandomBits(value);
-         break;
-#endif
-     case UNIFORM:
-        value = 2;
-        break;
-    case INTEGER_SEED:
-    default:
-         value = (T) index;
-        break;
-    }
-}
-
-
-/**
- * Initialize value (bool)
- */
-__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, bool &value, int index = 0)
-{
-    switch (gen_mode)
-    {
-#if (CUB_PTX_ARCH == 0)
-    case RANDOM:
-        char c;
-        RandomBits(c, 0, 0, 1);
-        value = (c > 0);
-        break;
-#endif
-     case UNIFORM:
-        value = true;
-        break;
-    case INTEGER_SEED:
-    default:
-        value = (index > 0);
-        break;
-    }
-}
-
-
-/**
- * cub::NullType test initialization
- */
-__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, cub::NullType &value, int index = 0)
-{}
-
-
-/**
- * cub::KeyValuePair<OffsetT, ValueT>test initialization
- */
-template <typename KeyT, typename ValueT>
-__host__ __device__ __forceinline__ void InitValue(
-    GenMode                             gen_mode,
-    cub::KeyValuePair<KeyT, ValueT>&    value,
-    int                                 index = 0)
-{
-    InitValue(gen_mode, value.value, index);
-
-    // Assign corresponding flag with a likelihood of the last bit being set with entropy-reduction level 3
-    RandomBits(value.key, 3);
-    value.key = (value.key & 0x1);
-}
-
-
-
-/******************************************************************************
- * Comparison and ostream operators
- ******************************************************************************/
-
-/**
- * KeyValuePair ostream operator
- */
-template <typename Key, typename Value>
-std::ostream& operator<<(std::ostream& os, const cub::KeyValuePair<Key, Value> &val)
-{
-    os << '(' << CoutCast(val.key) << ',' << CoutCast(val.value) << ')';
-    return os;
-}
-
-
-/******************************************************************************
- * Comparison and ostream operators for CUDA vector types
- ******************************************************************************/
-
-/**
- * Vector1 overloads
- */
-#define CUB_VEC_OVERLOAD_1(T, BaseT)                        \
-    /* Ostream output */                                    \
-    std::ostream& operator<<(                               \
-        std::ostream& os,                                   \
-        const T& val)                                       \
-    {                                                       \
-        os << '(' << CoutCast(val.x) << ')';                \
-        return os;                                          \
-    }                                                       \
-    /* Inequality */                                        \
-    __host__ __device__ __forceinline__ bool operator!=(    \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        return (a.x != b.x);                                \
-    }                                                       \
-    /* Equality */                                          \
-    __host__ __device__ __forceinline__ bool operator==(    \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        return (a.x == b.x);                                \
-    }                                                       \
-    /* Test initialization */                               \
-    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
-    {                                                       \
-        InitValue(gen_mode, value.x, index);                \
-    }                                                       \
-    /* Max */                                               \
-    __host__ __device__ __forceinline__ bool operator>(     \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        return (a.x > b.x);                                 \
-    }                                                       \
-    /* Min */                                               \
-    __host__ __device__ __forceinline__ bool operator<(     \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        return (a.x < b.x);                                 \
-    }                                                       \
-    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                       \
-    __host__ __device__ __forceinline__ T operator+(        \
-        T a,                                                \
-        T b)                                                \
-    {                                                       \
-        T retval = make_##T(a.x + b.x);                     \
-        return retval;                                      \
-    }                                                       \
-    namespace cub {                                         \
-    template<>                                              \
-    struct NumericTraits<T>                                 \
-    {                                                       \
-        static const Category CATEGORY = NOT_A_NUMBER;      \
-        enum {                                              \
-            PRIMITIVE       = false,                        \
-            NULL_TYPE       = false,                        \
-        };                                                  \
-        static T Max()                                      \
-        {                                                   \
-            T retval = {                                    \
-                NumericTraits<BaseT>::Max()};               \
-            return retval;                                  \
-        }                                                   \
-        static T Lowest()                                   \
-        {                                                   \
-            T retval = {                                    \
-                NumericTraits<BaseT>::Lowest()};            \
-            return retval;                                  \
-        }                                                   \
-    };                                                      \
-    } /* namespace std */
-
-
-
-/**
- * Vector2 overloads
- */
-#define CUB_VEC_OVERLOAD_2(T, BaseT)                        \
-    /* Ostream output */                                    \
-    std::ostream& operator<<(                               \
-        std::ostream& os,                                   \
-        const T& val)                                       \
-    {                                                       \
-        os << '('                                           \
-            << CoutCast(val.x) << ','                       \
-            << CoutCast(val.y) << ')';                      \
-        return os;                                          \
-    }                                                       \
-    /* Inequality */                                        \
-    __host__ __device__ __forceinline__ bool operator!=(    \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        return (a.x != b.x) ||                              \
-            (a.y != b.y);                                   \
-    }                                                       \
-    /* Equality */                                          \
-    __host__ __device__ __forceinline__ bool operator==(    \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        return (a.x == b.x) &&                              \
-            (a.y == b.y);                                   \
-    }                                                       \
-    /* Test initialization */                               \
-    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
-    {                                                       \
-        InitValue(gen_mode, value.x, index);                \
-        InitValue(gen_mode, value.y, index);                \
-    }                                                       \
-    /* Max */                                               \
-    __host__ __device__ __forceinline__ bool operator>(     \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
-        return a.y > b.y;                                               \
-    }                                                       \
-    /* Min */                                               \
-    __host__ __device__ __forceinline__ bool operator<(     \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
-        return a.y < b.y;                                               \
-    }                                                       \
-    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                                         \
-    __host__ __device__ __forceinline__ T operator+(        \
-        T a,                                         \
-        T b)                                         \
-    {                                                       \
-        T retval = make_##T(                                        \
-            a.x + b.x,                                      \
-            a.y + b.y);                                     \
-        return retval;                                      \
-    }                                                       \
-    namespace cub {                                         \
-    template<>                                              \
-    struct NumericTraits<T>                                 \
-    {                                                       \
-        static const Category CATEGORY = NOT_A_NUMBER;      \
-        enum {                                              \
-            PRIMITIVE       = false,                        \
-            NULL_TYPE       = false,                        \
-        };                                                  \
-        static T Max()                                      \
-        {                                                   \
-            T retval = {                                    \
-                NumericTraits<BaseT>::Max(),                \
-                NumericTraits<BaseT>::Max()};               \
-            return retval;                                  \
-        }                                                   \
-        static T Lowest()                                   \
-        {                                                   \
-            T retval = {                                    \
-                NumericTraits<BaseT>::Lowest(),             \
-                NumericTraits<BaseT>::Lowest()};            \
-            return retval;                                  \
-        }                                                   \
-    };                                                      \
-    } /* namespace cub */
-
-
-
-/**
- * Vector3 overloads
- */
-#define CUB_VEC_OVERLOAD_3(T, BaseT)                        \
-    /* Ostream output */                                    \
-    std::ostream& operator<<(                               \
-        std::ostream& os,                                   \
-        const T& val)                                       \
-    {                                                       \
-        os << '('                                           \
-            << CoutCast(val.x) << ','                       \
-            << CoutCast(val.y) << ','                       \
-            << CoutCast(val.z) << ')';                      \
-        return os;                                          \
-    }                                                       \
-    /* Inequality */                                        \
-    __host__ __device__ __forceinline__ bool operator!=(    \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        return (a.x != b.x) ||                              \
-            (a.y != b.y) ||                                 \
-            (a.z != b.z);                                   \
-    }                                                       \
-    /* Equality */                                          \
-    __host__ __device__ __forceinline__ bool operator==(    \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        return (a.x == b.x) &&                              \
-            (a.y == b.y) &&                                 \
-            (a.z == b.z);                                   \
-    }                                                       \
-    /* Test initialization */                               \
-    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
-    {                                                       \
-        InitValue(gen_mode, value.x, index);                \
-        InitValue(gen_mode, value.y, index);                \
-        InitValue(gen_mode, value.z, index);                \
-    }                                                       \
-    /* Max */                                               \
-    __host__ __device__ __forceinline__ bool operator>(     \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
-        if (a.y > b.y) return true; else if (b.y > a.y) return false;   \
-        return a.z > b.z;                                               \
-    }                                                       \
-    /* Min */                                               \
-    __host__ __device__ __forceinline__ bool operator<(     \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
-        if (a.y < b.y) return true; else if (b.y < a.y) return false;   \
-        return a.z < b.z;                                               \
-    }                                                       \
-    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                                         \
-    __host__ __device__ __forceinline__ T operator+(        \
-        T a,                                                \
-        T b)                                                \
-    {                                                       \
-        T retval = make_##T(                                        \
-            a.x + b.x,                                      \
-            a.y + b.y,                                      \
-            a.z + b.z);                                     \
-        return retval;                                      \
-    }                                                       \
-    namespace cub {                                         \
-    template<>                                              \
-    struct NumericTraits<T>                                 \
-    {                                                       \
-        static const Category CATEGORY = NOT_A_NUMBER;      \
-        enum {                                              \
-            PRIMITIVE       = false,                        \
-            NULL_TYPE       = false,                        \
-        };                                                  \
-        static T Max()                                      \
-        {                                                   \
-            T retval = {                                    \
-                NumericTraits<BaseT>::Max(),                \
-                NumericTraits<BaseT>::Max(),                \
-                NumericTraits<BaseT>::Max()};               \
-            return retval;                                  \
-        }                                                   \
-        static T Lowest()                                   \
-        {                                                   \
-            T retval = {                                    \
-                NumericTraits<BaseT>::Lowest(),             \
-                NumericTraits<BaseT>::Lowest(),             \
-                NumericTraits<BaseT>::Lowest()};            \
-            return retval;                                  \
-        }                                                   \
-    };                                                      \
-    } /* namespace cub */
-
-
-/**
- * Vector4 overloads
- */
-#define CUB_VEC_OVERLOAD_4(T, BaseT)                        \
-    /* Ostream output */                                    \
-    std::ostream& operator<<(                               \
-        std::ostream& os,                                   \
-        const T& val)                                       \
-    {                                                       \
-        os << '('                                           \
-            << CoutCast(val.x) << ','                       \
-            << CoutCast(val.y) << ','                       \
-            << CoutCast(val.z) << ','                       \
-            << CoutCast(val.w) << ')';                      \
-        return os;                                          \
-    }                                                       \
-    /* Inequality */                                        \
-    __host__ __device__ __forceinline__ bool operator!=(    \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        return (a.x != b.x) ||                              \
-            (a.y != b.y) ||                                 \
-            (a.z != b.z) ||                                 \
-            (a.w != b.w);                                   \
-    }                                                       \
-    /* Equality */                                          \
-    __host__ __device__ __forceinline__ bool operator==(    \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        return (a.x == b.x) &&                              \
-            (a.y == b.y) &&                                 \
-            (a.z == b.z) &&                                 \
-            (a.w == b.w);                                   \
-    }                                                       \
-    /* Test initialization */                               \
-    __host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, T &value, int index = 0)   \
-    {                                                       \
-        InitValue(gen_mode, value.x, index);                \
-        InitValue(gen_mode, value.y, index);                \
-        InitValue(gen_mode, value.z, index);                \
-        InitValue(gen_mode, value.w, index);                \
-    }                                                       \
-    /* Max */                                               \
-    __host__ __device__ __forceinline__ bool operator>(     \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        if (a.x > b.x) return true; else if (b.x > a.x) return false;   \
-        if (a.y > b.y) return true; else if (b.y > a.y) return false;   \
-        if (a.z > b.z) return true; else if (b.z > a.z) return false;   \
-        return a.w > b.w;                                               \
-    }                                                       \
-    /* Min */                                               \
-    __host__ __device__ __forceinline__ bool operator<(     \
-        const T &a,                                         \
-        const T &b)                                         \
-    {                                                       \
-        if (a.x < b.x) return true; else if (b.x < a.x) return false;   \
-        if (a.y < b.y) return true; else if (b.y < a.y) return false;   \
-        if (a.z < b.z) return true; else if (b.z < a.z) return false;   \
-        return a.w < b.w;                                               \
-    }                                                       \
-    /* Summation (non-reference addends for VS2003 -O3 warpscan workaround */                                         \
-    __host__ __device__ __forceinline__ T operator+(        \
-        T a,                                                \
-        T b)                                                \
-    {                                                       \
-        T retval = make_##T(                                        \
-            a.x + b.x,                                      \
-            a.y + b.y,                                      \
-            a.z + b.z,                                      \
-            a.w + b.w);                                     \
-        return retval;                                      \
-    }                                                       \
-    namespace cub {                                         \
-    template<>                                              \
-    struct NumericTraits<T>                                 \
-    {                                                       \
-        static const Category CATEGORY = NOT_A_NUMBER;      \
-        enum {                                              \
-            PRIMITIVE       = false,                        \
-            NULL_TYPE       = false,                        \
-        };                                                  \
-        static T Max()                                      \
-        {                                                   \
-            T retval = {                                    \
-                NumericTraits<BaseT>::Max(),                \
-                NumericTraits<BaseT>::Max(),                \
-                NumericTraits<BaseT>::Max(),                \
-                NumericTraits<BaseT>::Max()};               \
-            return retval;                                  \
-        }                                                   \
-        static T Lowest()                                   \
-        {                                                   \
-            T retval = {                                    \
-                NumericTraits<BaseT>::Lowest(),             \
-                NumericTraits<BaseT>::Lowest(),             \
-                NumericTraits<BaseT>::Lowest(),             \
-                NumericTraits<BaseT>::Lowest()};            \
-            return retval;                                  \
-        }                                                   \
-    };                                                      \
-    } /* namespace cub */
-
-/**
- * All vector overloads
- */
-#define CUB_VEC_OVERLOAD(COMPONENT_T, BaseT)                    \
-    CUB_VEC_OVERLOAD_1(COMPONENT_T##1, BaseT)                   \
-    CUB_VEC_OVERLOAD_2(COMPONENT_T##2, BaseT)                   \
-    CUB_VEC_OVERLOAD_3(COMPONENT_T##3, BaseT)                   \
-    CUB_VEC_OVERLOAD_4(COMPONENT_T##4, BaseT)
-
-/**
- * Define for types
- */
-CUB_VEC_OVERLOAD(char, char)
-CUB_VEC_OVERLOAD(short, short)
-CUB_VEC_OVERLOAD(int, int)
-CUB_VEC_OVERLOAD(long, long)
-CUB_VEC_OVERLOAD(longlong, long long)
-CUB_VEC_OVERLOAD(uchar, unsigned char)
-CUB_VEC_OVERLOAD(ushort, unsigned short)
-CUB_VEC_OVERLOAD(uint, unsigned int)
-CUB_VEC_OVERLOAD(ulong, unsigned long)
-CUB_VEC_OVERLOAD(ulonglong, unsigned long long)
-CUB_VEC_OVERLOAD(float, float)
-CUB_VEC_OVERLOAD(double, double)
-
-
-//---------------------------------------------------------------------
-// Complex data type TestFoo
-//---------------------------------------------------------------------
-
-/**
- * TestFoo complex data type
- */
-struct TestFoo
-{
-    long long   x;
-    int         y;
-    short       z;
-    char        w;
-
-    // Factory
-    static __host__ __device__ __forceinline__ TestFoo MakeTestFoo(long long x, int y, short z, char w)
-    {
-        TestFoo retval = {x, y, z, w};
-        return retval;
-    }
-
-    // Assignment from int operator
-    __host__ __device__ __forceinline__ TestFoo& operator =(int b)
-    {
-        x = b;
-        y = b;
-        z = b;
-        w = b;
-        return *this;
-    }
-
-    // Summation operator
-    __host__ __device__ __forceinline__ TestFoo operator+(const TestFoo &b) const
-    {
-        return MakeTestFoo(x + b.x, y + b.y, z + b.z, w + b.w);
-    }
-
-    // Inequality operator
-    __host__ __device__ __forceinline__ bool operator !=(const TestFoo &b) const
-    {
-        return (x != b.x) || (y != b.y) || (z != b.z) || (w != b.w);
-    }
-
-    // Equality operator
-    __host__ __device__ __forceinline__ bool operator ==(const TestFoo &b) const
-    {
-        return (x == b.x) && (y == b.y) && (z == b.z) && (w == b.w);
-    }
-
-    // Less than operator
-    __host__ __device__ __forceinline__ bool operator <(const TestFoo &b) const
-    {
-        if (x < b.x) return true; else if (b.x < x) return false;
-        if (y < b.y) return true; else if (b.y < y) return false;
-        if (z < b.z) return true; else if (b.z < z) return false;
-        return w < b.w;
-    }
-
-    // Greater than operator
-    __host__ __device__ __forceinline__ bool operator >(const TestFoo &b) const
-    {
-        if (x > b.x) return true; else if (b.x > x) return false;
-        if (y > b.y) return true; else if (b.y > y) return false;
-        if (z > b.z) return true; else if (b.z > z) return false;
-        return w > b.w;
-    }
-
-};
-
-/**
- * TestFoo ostream operator
- */
-std::ostream& operator<<(std::ostream& os, const TestFoo& val)
-{
-    os << '(' << val.x << ',' << val.y << ',' << val.z << ',' << CoutCast(val.w) << ')';
-    return os;
-}
-
-/**
- * TestFoo test initialization
- */
-__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestFoo &value, int index = 0)
-{
-    InitValue(gen_mode, value.x, index);
-    InitValue(gen_mode, value.y, index);
-    InitValue(gen_mode, value.z, index);
-    InitValue(gen_mode, value.w, index);
-}
-
-
-/// numeric_limits<TestFoo> specialization
-namespace cub {
-template<>
-struct NumericTraits<TestFoo>
-{
-    static const Category CATEGORY = NOT_A_NUMBER;
-    enum {
-        PRIMITIVE       = false,
-        NULL_TYPE       = false,
-    };
-    static TestFoo Max()
-    {
-        return TestFoo::MakeTestFoo(
-            NumericTraits<long long>::Max(),
-            NumericTraits<int>::Max(),
-            NumericTraits<short>::Max(),
-            NumericTraits<char>::Max());
-    }
-
-    static TestFoo Lowest()
-    {
-        return TestFoo::MakeTestFoo(
-            NumericTraits<long long>::Lowest(),
-            NumericTraits<int>::Lowest(),
-            NumericTraits<short>::Lowest(),
-            NumericTraits<char>::Lowest());
-    }
-};
-} // namespace cub
-
-
-//---------------------------------------------------------------------
-// Complex data type TestBar (with optimizations for fence-free warp-synchrony)
-//---------------------------------------------------------------------
-
-/**
- * TestBar complex data type
- */
-struct TestBar
-{
-    long long       x;
-    int             y;
-
-    // Constructor
-    __host__ __device__ __forceinline__ TestBar() : x(0), y(0)
-    {}
-
-    // Constructor
-    __host__ __device__ __forceinline__ TestBar(int b) : x(b), y(b)
-    {}
-
-    // Constructor
-    __host__ __device__ __forceinline__ TestBar(long long x, int y) : x(x), y(y)
-    {}
-
-    // Assignment from int operator
-    __host__ __device__ __forceinline__ TestBar& operator =(int b)
-    {
-        x = b;
-        y = b;
-        return *this;
-    }
-
-    // Summation operator
-    __host__ __device__ __forceinline__ TestBar operator+(const TestBar &b) const
-    {
-        return TestBar(x + b.x, y + b.y);
-    }
-
-    // Inequality operator
-    __host__ __device__ __forceinline__ bool operator !=(const TestBar &b) const
-    {
-        return (x != b.x) || (y != b.y);
-    }
-
-    // Equality operator
-    __host__ __device__ __forceinline__ bool operator ==(const TestBar &b) const
-    {
-        return (x == b.x) && (y == b.y);
-    }
-
-    // Less than operator
-    __host__ __device__ __forceinline__ bool operator <(const TestBar &b) const
-    {
-        if (x < b.x) return true; else if (b.x < x) return false;
-        return y < b.y;
-    }
-
-    // Greater than operator
-    __host__ __device__ __forceinline__ bool operator >(const TestBar &b) const
-    {
-        if (x > b.x) return true; else if (b.x > x) return false;
-        return y > b.y;
-    }
-
-};
-
-
-/**
- * TestBar ostream operator
- */
-std::ostream& operator<<(std::ostream& os, const TestBar& val)
-{
-    os << '(' << val.x << ',' << val.y << ')';
-    return os;
-}
-
-/**
- * TestBar test initialization
- */
-__host__ __device__ __forceinline__ void InitValue(GenMode gen_mode, TestBar &value, int index = 0)
-{
-    InitValue(gen_mode, value.x, index);
-    InitValue(gen_mode, value.y, index);
-}
-
-/// numeric_limits<TestBar> specialization
-namespace cub {
-template<>
-struct NumericTraits<TestBar>
-{
-    static const Category CATEGORY = NOT_A_NUMBER;
-    enum {
-        PRIMITIVE       = false,
-        NULL_TYPE       = false,
-    };
-    static TestBar Max()
-    {
-        return TestBar(
-            NumericTraits<long long>::Max(),
-            NumericTraits<int>::Max());
-    }
-
-    static TestBar Lowest()
-    {
-        return TestBar(
-            NumericTraits<long long>::Lowest(),
-            NumericTraits<int>::Lowest());
-    }
-};
-} // namespace cub
-
-
-/******************************************************************************
- * Helper routines for list comparison and display
- ******************************************************************************/
-
-
-/**
- * Compares the equivalence of two arrays
- */
-template <typename S, typename T, typename OffsetT>
-int CompareResults(T* computed, S* reference, OffsetT len, bool verbose = true)
-{
-    for (OffsetT i = 0; i < len; i++)
-    {
-        if (computed[i] != reference[i])
-        {
-            if (verbose) std::cout << "INCORRECT: [" << i << "]: "
-                << CoutCast(computed[i]) << " != "
-                << CoutCast(reference[i]);
-            return 1;
-        }
-    }
-    return 0;
-}
-
-
-/**
- * Compares the equivalence of two arrays
- */
-template <typename OffsetT>
-int CompareResults(float* computed, float* reference, OffsetT len, bool verbose = true)
-{
-    for (OffsetT i = 0; i < len; i++)
-    {
-        if (computed[i] != reference[i])
-        {
-            float difference = std::abs(computed[i]-reference[i]);
-            float fraction = difference / std::abs(reference[i]);
-
-            if (fraction > 0.0001)
-            {
-                if (verbose) std::cout << "INCORRECT: [" << i << "]: "
-                    << "(computed) " << CoutCast(computed[i]) << " != "
-                    << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
-                return 1;
-            }
-        }
-    }
-    return 0;
-}
-
-
-/**
- * Compares the equivalence of two arrays
- */
-template <typename OffsetT>
-int CompareResults(cub::NullType* computed, cub::NullType* reference, OffsetT len, bool verbose = true)
-{
-    return 0;
-}
-
-/**
- * Compares the equivalence of two arrays
- */
-template <typename OffsetT>
-int CompareResults(double* computed, double* reference, OffsetT len, bool verbose = true)
-{
-    for (OffsetT i = 0; i < len; i++)
-    {
-        if (computed[i] != reference[i])
-        {
-            double difference = std::abs(computed[i]-reference[i]);
-            double fraction = difference / std::abs(reference[i]);
-
-            if (fraction > 0.0001)
-            {
-                if (verbose) std::cout << "INCORRECT: [" << i << "]: "
-                    << CoutCast(computed[i]) << " != "
-                    << CoutCast(reference[i]) << " (difference:" << difference << ", fraction: " << fraction << ")";
-                return 1;
-            }
-        }
-    }
-    return 0;
-}
-
-
-/**
- * Verify the contents of a device array match those
- * of a host array
- */
-int CompareDeviceResults(
-    cub::NullType *h_reference,
-    cub::NullType *d_data,
-    size_t num_items,
-    bool verbose = true,
-    bool display_data = false)
-{
-    return 0;
-}
-
-
-/**
- * Verify the contents of a device array match those
- * of a host array
- */
-template <typename S, typename T>
-int CompareDeviceResults(
-    S *h_reference,
-    T *d_data,
-    size_t num_items,
-    bool verbose = true,
-    bool display_data = false)
-{
-    // Allocate array on host
-    T *h_data = (T*) malloc(num_items * sizeof(T));
-
-    // Copy data back
-    cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
-
-    // Display data
-    if (display_data)
-    {
-        printf("Reference:\n");
-        for (int i = 0; i < int(num_items); i++)
-        {
-            std::cout << CoutCast(h_reference[i]) << ", ";
-        }
-        printf("\n\nComputed:\n");
-        for (int i = 0; i < int(num_items); i++)
-        {
-            std::cout << CoutCast(h_data[i]) << ", ";
-        }
-        printf("\n\n");
-    }
-
-    // Check
-    int retval = CompareResults(h_data, h_reference, num_items, verbose);
-
-    // Cleanup
-    if (h_data) free(h_data);
-
-    return retval;
-}
-
-
-/**
- * Verify the contents of a device array match those
- * of a device array
- */
-template <typename T>
-int CompareDeviceDeviceResults(
-    T *d_reference,
-    T *d_data,
-    size_t num_items,
-    bool verbose = true,
-    bool display_data = false)
-{
-    // Allocate array on host
-    T *h_reference = (T*) malloc(num_items * sizeof(T));
-    T *h_data = (T*) malloc(num_items * sizeof(T));
-
-    // Copy data back
-    cudaMemcpy(h_reference, d_reference, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
-
-    // Display data
-    if (display_data) {
-        printf("Reference:\n");
-        for (int i = 0; i < num_items; i++)
-        {
-            std::cout << CoutCast(h_reference[i]) << ", ";
-        }
-        printf("\n\nComputed:\n");
-        for (int i = 0; i < num_items; i++)
-        {
-            std::cout << CoutCast(h_data[i]) << ", ";
-        }
-        printf("\n\n");
-    }
-
-    // Check
-    int retval = CompareResults(h_data, h_reference, num_items, verbose);
-
-    // Cleanup
-    if (h_reference) free(h_reference);
-    if (h_data) free(h_data);
-
-    return retval;
-}
-
-
-/**
- * Print the contents of a host array
- */
-void DisplayResults(
-    cub::NullType   *h_data,
-    size_t          num_items)
-{}
-
-
-/**
- * Print the contents of a host array
- */
-template <typename InputIteratorT>
-void DisplayResults(
-    InputIteratorT h_data,
-    size_t num_items)
-{
-    // Display data
-    for (int i = 0; i < int(num_items); i++)
-    {
-        std::cout << CoutCast(h_data[i]) << ", ";
-    }
-    printf("\n");
-}
-
-
-/**
- * Print the contents of a device array
- */
-template <typename T>
-void DisplayDeviceResults(
-    T *d_data,
-    size_t num_items)
-{
-    // Allocate array on host
-    T *h_data = (T*) malloc(num_items * sizeof(T));
-
-    // Copy data back
-    cudaMemcpy(h_data, d_data, sizeof(T) * num_items, cudaMemcpyDeviceToHost);
-
-    DisplayResults(h_data, num_items);
-
-    // Cleanup
-    if (h_data) free(h_data);
-}
-
-
-/******************************************************************************
- * Segment descriptor generation
- ******************************************************************************/
-
-/**
- * Initialize segments
- */
-void InitializeSegments(
-    int     num_items,
-    int     num_segments,
-    int     *h_segment_offsets,
-    bool    verbose = false)
-{
-    if (num_segments <= 0)
-        return;
-
-    unsigned int expected_segment_length = (num_items + num_segments - 1) / num_segments;
-    int offset = 0;
-    for (int i = 0; i < num_segments; ++i)
-    {
-        h_segment_offsets[i] = offset;
-
-        unsigned int segment_length = RandomValue((expected_segment_length * 2) + 1);
-        offset += segment_length;
-        offset = CUB_MIN(offset, num_items);
-    }
-    h_segment_offsets[num_segments] = num_items;
-
-    if (verbose)
-    {
-        printf("Segment offsets: ");
-        DisplayResults(h_segment_offsets, num_segments + 1);
-    }
-}
-
-
-/******************************************************************************
- * Timing
- ******************************************************************************/
-
-
-struct CpuTimer
-{
-#if defined(_WIN32) || defined(_WIN64)
-
-    LARGE_INTEGER ll_freq;
-    LARGE_INTEGER ll_start;
-    LARGE_INTEGER ll_stop;
-
-    CpuTimer()
-    {
-        QueryPerformanceFrequency(&ll_freq);
-    }
-
-    void Start()
-    {
-        QueryPerformanceCounter(&ll_start);
-    }
-
-    void Stop()
-    {
-        QueryPerformanceCounter(&ll_stop);
-    }
-
-    float ElapsedMillis()
-    {
-        double start = double(ll_start.QuadPart) / double(ll_freq.QuadPart);
-        double stop  = double(ll_stop.QuadPart) / double(ll_freq.QuadPart);
-
-        return float((stop - start) * 1000);
-    }
-
-#else
-
-    rusage start;
-    rusage stop;
-
-    void Start()
-    {
-        getrusage(RUSAGE_SELF, &start);
-    }
-
-    void Stop()
-    {
-        getrusage(RUSAGE_SELF, &stop);
-    }
-
-    float ElapsedMillis()
-    {
-        float sec = stop.ru_utime.tv_sec - start.ru_utime.tv_sec;
-        float usec = stop.ru_utime.tv_usec - start.ru_utime.tv_usec;
-
-        return (sec * 1000) + (usec / 1000);
-    }
-
-#endif
-};
-
-struct GpuTimer
-{
-    cudaEvent_t start;
-    cudaEvent_t stop;
-
-    GpuTimer()
-    {
-        cudaEventCreate(&start);
-        cudaEventCreate(&stop);
-    }
-
-    ~GpuTimer()
-    {
-        cudaEventDestroy(start);
-        cudaEventDestroy(stop);
-    }
-
-    void Start()
-    {
-        cudaEventRecord(start, 0);
-    }
-
-    void Stop()
-    {
-        cudaEventRecord(stop, 0);
-    }
-
-    float ElapsedMillis()
-    {
-        float elapsed;
-        cudaEventSynchronize(stop);
-        cudaEventElapsedTime(&elapsed, start, stop);
-        return elapsed;
-    }
-};
diff --git a/external/cub/test/test_warp_reduce.cu b/external/cub/test/test_warp_reduce.cu
deleted file mode 100644
index 130f20e3e87..00000000000
--- a/external/cub/test/test_warp_reduce.cu
+++ /dev/null
@@ -1,840 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of WarpReduce utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <typeinfo>
-
-#include <cub/warp/warp_reduce.cuh>
-#include <cub/util_allocator.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose       = false;
-int                     g_repeat        = 0;
-CachingDeviceAllocator  g_allocator(true);
-
-
-/**
- * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants)
- */
-template<
-    typename    OpT,
-    int         LOGICAL_WARP_THREADS>
-struct WrapperFunctor
-{
-    OpT op;
-    int num_valid;
-
-    inline __host__ __device__ WrapperFunctor(OpT op, int num_valid) : op(op), num_valid(num_valid) {}
-
-    template <typename T>
-    inline __host__ __device__ T operator()(const T &a, const T &b) const
-    {
-#if CUB_PTX_ARCH != 0
-        if ((cub::LaneId() % LOGICAL_WARP_THREADS) >= num_valid)
-            cub::ThreadTrap();
-#endif
-
-        return op(a, b);
-    }
-
-};
-
-
-//---------------------------------------------------------------------
-// Test kernels
-//---------------------------------------------------------------------
-
-/**
- * Generic reduction
- */
-template <
-    typename    T,
-    typename    ReductionOp,
-    typename    WarpReduce,
-    bool        PRIMITIVE = Traits<T>::PRIMITIVE>
-struct DeviceTest
-{
-    static __device__ __forceinline__ T Reduce(
-        typename WarpReduce::TempStorage    &temp_storage,
-        T                                   &data,
-        ReductionOp                         &reduction_op)
-    {
-        return WarpReduce(temp_storage).Reduce(data, reduction_op);
-    }
-
-    static __device__ __forceinline__ T Reduce(
-        typename WarpReduce::TempStorage    &temp_storage,
-        T                                   &data,
-        ReductionOp                         &reduction_op,
-        const int                           &valid_warp_threads)
-    {
-        return WarpReduce(temp_storage).Reduce(data, reduction_op, valid_warp_threads);
-    }
-
-    template <typename FlagT>
-    static __device__ __forceinline__ T HeadSegmentedReduce(
-        typename WarpReduce::TempStorage    &temp_storage,
-        T                                   &data,
-        FlagT                                &flag,
-        ReductionOp                         &reduction_op)
-    {
-        return WarpReduce(temp_storage).HeadSegmentedReduce(data, flag, reduction_op);
-    }
-
-    template <typename FlagT>
-    static __device__ __forceinline__ T TailSegmentedReduce(
-        typename WarpReduce::TempStorage    &temp_storage,
-        T                                   &data,
-        FlagT                                &flag,
-        ReductionOp                         &reduction_op)
-    {
-        return WarpReduce(temp_storage).TailSegmentedReduce(data, flag, reduction_op);
-    }
-
-};
-
-
-/**
- * Summation
- */
-template <
-    typename    T,
-    typename    WarpReduce>
-struct DeviceTest<T, Sum, WarpReduce, true>
-{
-    static __device__ __forceinline__ T Reduce(
-        typename WarpReduce::TempStorage    &temp_storage,
-        T                                   &data,
-        Sum                              &reduction_op)
-    {
-        return WarpReduce(temp_storage).Sum(data);
-    }
-
-    static __device__ __forceinline__ T Reduce(
-        typename WarpReduce::TempStorage    &temp_storage,
-        T                                   &data,
-        Sum                              &reduction_op,
-        const int                           &valid_warp_threads)
-    {
-        return WarpReduce(temp_storage).Sum(data, valid_warp_threads);
-    }
-
-    template <typename FlagT>
-    static __device__ __forceinline__ T HeadSegmentedReduce(
-        typename WarpReduce::TempStorage    &temp_storage,
-        T                                   &data,
-        FlagT                                &flag,
-        Sum                              &reduction_op)
-    {
-        return WarpReduce(temp_storage).HeadSegmentedSum(data, flag);
-    }
-
-    template <typename FlagT>
-    static __device__ __forceinline__ T TailSegmentedReduce(
-        typename WarpReduce::TempStorage    &temp_storage,
-        T                                   &data,
-        FlagT                                &flag,
-        Sum                              &reduction_op)
-    {
-        return WarpReduce(temp_storage).TailSegmentedSum(data, flag);
-    }
-
-};
-
-
-/**
- * Full-tile warp reduction kernel
- */
-template <
-    int         WARPS,
-    int         LOGICAL_WARP_THREADS,
-    typename    T,
-    typename    ReductionOp>
-__global__ void FullWarpReduceKernel(
-    T               *d_in,
-    T               *d_out,
-    ReductionOp     reduction_op,
-    clock_t         *d_elapsed)
-{
-    // Cooperative warp-reduce utility type (1 warp)
-    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
-
-    // Allocate temp storage in shared memory
-    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
-
-    // Per-thread tile data
-    T input = d_in[threadIdx.x];
-
-    // Record elapsed clocks
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t start = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    // Test warp reduce
-    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
-
-    T output = DeviceTest<T, ReductionOp, WarpReduce>::Reduce(
-        temp_storage[warp_id], input, reduction_op);
-
-    // Record elapsed clocks
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t stop = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    *d_elapsed = stop - start;
-
-    // Store aggregate
-    d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ?
-        output :
-        input;
-}
-
-/**
- * Partially-full warp reduction kernel
- */
-template <
-    int         WARPS,
-    int         LOGICAL_WARP_THREADS,
-    typename    T,
-    typename    ReductionOp>
-__global__ void PartialWarpReduceKernel(
-    T           *d_in,
-    T           *d_out,
-    ReductionOp reduction_op,
-    clock_t     *d_elapsed,
-    int         valid_warp_threads)
-{
-    // Cooperative warp-reduce utility type
-    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
-
-    // Allocate temp storage in shared memory
-    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
-
-    // Per-thread tile data
-    T input = d_in[threadIdx.x];
-
-    // Record elapsed clocks
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t start = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    // Test partial-warp reduce
-    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
-    T output = DeviceTest<T, ReductionOp, WarpReduce>::Reduce(
-        temp_storage[warp_id], input, reduction_op, valid_warp_threads);
-
-    // Record elapsed clocks
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t stop = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    *d_elapsed = stop - start;
-
-    // Store aggregate
-    d_out[threadIdx.x] = (threadIdx.x % LOGICAL_WARP_THREADS == 0) ?
-        output :
-        input;
-}
-
-
-/**
- * Head-based segmented warp reduction test kernel
- */
-template <
-    int         WARPS,
-    int         LOGICAL_WARP_THREADS,
-    typename    T,
-    typename    FlagT,
-    typename    ReductionOp>
-__global__ void WarpHeadSegmentedReduceKernel(
-    T           *d_in,
-    FlagT        *d_head_flags,
-    T           *d_out,
-    ReductionOp reduction_op,
-    clock_t     *d_elapsed)
-{
-    // Cooperative warp-reduce utility type
-    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
-
-    // Allocate temp storage in shared memory
-    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
-
-    // Per-thread tile data
-    T       input       = d_in[threadIdx.x];
-    FlagT   head_flag   = d_head_flags[threadIdx.x];
-
-    // Record elapsed clocks
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t start = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    // Test segmented warp reduce
-    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
-    T output = DeviceTest<T, ReductionOp, WarpReduce>::HeadSegmentedReduce(
-        temp_storage[warp_id], input, head_flag, reduction_op);
-
-    // Record elapsed clocks
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t stop = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    *d_elapsed = stop - start;
-
-    // Store aggregate
-    d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ?
-        output :
-        input;
-}
-
-
-/**
- * Tail-based segmented warp reduction test kernel
- */
-template <
-    int         WARPS,
-    int         LOGICAL_WARP_THREADS,
-    typename    T,
-    typename    FlagT,
-    typename    ReductionOp>
-__global__ void WarpTailSegmentedReduceKernel(
-    T           *d_in,
-    FlagT       *d_tail_flags,
-    T           *d_out,
-    ReductionOp reduction_op,
-    clock_t     *d_elapsed)
-{
-    // Cooperative warp-reduce utility type
-    typedef WarpReduce<T, LOGICAL_WARP_THREADS> WarpReduce;
-
-    // Allocate temp storage in shared memory
-    __shared__ typename WarpReduce::TempStorage temp_storage[WARPS];
-
-    // Per-thread tile data
-    T       input       = d_in[threadIdx.x];
-    FlagT    tail_flag   = d_tail_flags[threadIdx.x];
-    FlagT    head_flag   = (threadIdx.x == 0) ?
-                            0 :
-                            d_tail_flags[threadIdx.x - 1];
-
-    // Record elapsed clocks
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t start = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    // Test segmented warp reduce
-    int warp_id = threadIdx.x / LOGICAL_WARP_THREADS;
-    T output = DeviceTest<T, ReductionOp, WarpReduce>::TailSegmentedReduce(
-        temp_storage[warp_id], input, tail_flag, reduction_op);
-
-    // Record elapsed clocks
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t stop = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    *d_elapsed = stop - start;
-
-    // Store aggregate
-    d_out[threadIdx.x] = ((threadIdx.x % LOGICAL_WARP_THREADS == 0) || head_flag) ?
-        output :
-        input;
-}
-
-
-//---------------------------------------------------------------------
-// Host utility subroutines
-//---------------------------------------------------------------------
-
-/**
- * Initialize reduction problem (and solution)
- */
-template <
-    typename    T,
-    typename    ReductionOp>
-void Initialize(
-    GenMode     gen_mode,
-    int         flag_entropy,
-    T           *h_in,
-    int         *h_flags,
-    int         warps,
-    int         warp_threads,
-    int         valid_warp_threads,
-    ReductionOp reduction_op,
-    T           *h_head_out,
-    T           *h_tail_out)
-{
-    for (int i = 0; i < warps * warp_threads; ++i)
-    {
-        // Sample a value for this item
-        InitValue(gen_mode, h_in[i], i);
-        h_head_out[i] = h_in[i];
-        h_tail_out[i] = h_in[i];
-
-        // Sample whether or not this item will be a segment head
-        char bits;
-        RandomBits(bits, flag_entropy);
-        h_flags[i] = bits & 0x1;
-    }
-
-    // Accumulate segments (lane 0 of each warp is implicitly a segment head)
-    for (int warp = 0; warp < warps; ++warp)
-    {
-        int warp_offset  = warp * warp_threads;
-        int item_offset = warp_offset + valid_warp_threads - 1;
-
-        // Last item in warp
-        T head_aggregate = h_in[item_offset];
-        T tail_aggregate = h_in[item_offset];
-
-        if (h_flags[item_offset])
-            h_head_out[item_offset] = head_aggregate;
-        item_offset--;
-
-        // Work backwards
-        while (item_offset >= warp_offset)
-        {
-            if (h_flags[item_offset + 1])
-            {
-                head_aggregate = h_in[item_offset];
-            }
-            else
-            {
-                head_aggregate = reduction_op(head_aggregate, h_in[item_offset]);
-            }
-
-            if (h_flags[item_offset])
-            {
-                h_head_out[item_offset] = head_aggregate;
-                h_tail_out[item_offset + 1] = tail_aggregate;
-                tail_aggregate = h_in[item_offset];
-            }
-            else
-            {
-                tail_aggregate = reduction_op(tail_aggregate, h_in[item_offset]);
-            }
-
-            item_offset--;
-        }
-
-        // Record last segment head_aggregate to head offset
-        h_head_out[warp_offset] = head_aggregate;
-        h_tail_out[warp_offset] = tail_aggregate;
-    }
-}
-
-
-/**
- * Test warp reduction
- */
-template <
-    int         WARPS,
-    int         LOGICAL_WARP_THREADS,
-    typename    T,
-    typename    ReductionOp>
-void TestReduce(
-    GenMode     gen_mode,
-    ReductionOp reduction_op,
-    int         valid_warp_threads = LOGICAL_WARP_THREADS)
-{
-    const int BLOCK_THREADS = LOGICAL_WARP_THREADS * WARPS;
-
-    // Allocate host arrays
-    T   *h_in           = new T[BLOCK_THREADS];
-    int *h_flags        = new int[BLOCK_THREADS];
-    T   *h_out          = new T[BLOCK_THREADS];
-    T   *h_tail_out     = new T[BLOCK_THREADS];
-
-    // Initialize problem
-    Initialize(gen_mode, -1, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, valid_warp_threads, reduction_op, h_out, h_tail_out);
-
-    // Initialize/clear device arrays
-    T *d_in = NULL;
-    T *d_out = NULL;
-    clock_t *d_elapsed = NULL;
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * BLOCK_THREADS));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * BLOCK_THREADS));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * BLOCK_THREADS, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * BLOCK_THREADS));
-
-    if (g_verbose)
-    {
-        printf("Data:\n");
-        for (int i = 0; i < WARPS; ++i)
-            DisplayResults(h_in + (i * LOGICAL_WARP_THREADS), valid_warp_threads);
-    }
-
-    // Run kernel
-    printf("\nGen-mode %d, %d warps, %d warp threads, %d valid lanes, %s (%d bytes) elements:\n",
-        gen_mode,
-        WARPS,
-        LOGICAL_WARP_THREADS,
-        valid_warp_threads,
-        typeid(T).name(),
-        (int) sizeof(T));
-    fflush(stdout);
-
-    if (valid_warp_threads == LOGICAL_WARP_THREADS)
-    {
-        // Run full-warp kernel
-        FullWarpReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
-            d_in,
-            d_out,
-            reduction_op,
-            d_elapsed);
-    }
-    else
-    {
-        // Run partial-warp kernel
-        PartialWarpReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
-            d_in,
-            d_out,
-            reduction_op,
-            d_elapsed,
-            valid_warp_threads);
-    }
-
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Copy out and display results
-    printf("\tReduction results: ");
-    int compare = CompareDeviceResults(h_out, d_out, BLOCK_THREADS, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-    printf("\tElapsed clocks: ");
-    DisplayDeviceResults(d_elapsed, 1);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_flags) delete[] h_flags;
-    if (h_out) delete[] h_out;
-    if (h_tail_out) delete[] h_tail_out;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
-}
-
-
-/**
- * Test warp segmented reduction
- */
-template <
-    int         WARPS,
-    int         LOGICAL_WARP_THREADS,
-    typename    T,
-    typename    ReductionOp>
-void TestSegmentedReduce(
-    GenMode     gen_mode,
-    int         flag_entropy,
-    ReductionOp reduction_op)
-{
-    const int BLOCK_THREADS = LOGICAL_WARP_THREADS * WARPS;
-
-    // Allocate host arrays
-    int compare;
-    T   *h_in           = new T[BLOCK_THREADS];
-    int *h_flags        = new int[BLOCK_THREADS];
-    T   *h_head_out     = new T[BLOCK_THREADS];
-    T   *h_tail_out     = new T[BLOCK_THREADS];
-
-    // Initialize problem
-    Initialize(gen_mode, flag_entropy, h_in, h_flags, WARPS, LOGICAL_WARP_THREADS, LOGICAL_WARP_THREADS, reduction_op, h_head_out, h_tail_out);
-
-    // Initialize/clear device arrays
-    T           *d_in = NULL;
-    int         *d_flags = NULL;
-    T           *d_head_out = NULL;
-    T           *d_tail_out = NULL;
-    clock_t     *d_elapsed = NULL;
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * BLOCK_THREADS));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(int) * BLOCK_THREADS));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_head_out, sizeof(T) * BLOCK_THREADS));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_tail_out, sizeof(T) * BLOCK_THREADS));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * BLOCK_THREADS, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(int) * BLOCK_THREADS, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemset(d_head_out, 0, sizeof(T) * BLOCK_THREADS));
-    CubDebugExit(cudaMemset(d_tail_out, 0, sizeof(T) * BLOCK_THREADS));
-
-    if (g_verbose)
-    {
-        printf("Data:\n");
-        for (int i = 0; i < WARPS; ++i)
-            DisplayResults(h_in + (i * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS);
-
-        printf("\nFlags:\n");
-        for (int i = 0; i < WARPS; ++i)
-            DisplayResults(h_flags + (i * LOGICAL_WARP_THREADS), LOGICAL_WARP_THREADS);
-    }
-
-    printf("\nGen-mode %d, head flag entropy reduction %d, %d warps, %d warp threads, %s (%d bytes) elements:\n",
-        gen_mode,
-        flag_entropy,
-        WARPS,
-        LOGICAL_WARP_THREADS,
-        typeid(T).name(),
-        (int) sizeof(T));
-    fflush(stdout);
-
-    // Run head-based kernel
-    WarpHeadSegmentedReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
-        d_in,
-        d_flags,
-        d_head_out,
-        reduction_op,
-        d_elapsed);
-
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Copy out and display results
-    printf("\tHead-based segmented reduction results: ");
-    compare = CompareDeviceResults(h_head_out, d_head_out, BLOCK_THREADS, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-    printf("\tElapsed clocks: ");
-    DisplayDeviceResults(d_elapsed, 1);
-
-    // Run tail-based kernel
-    WarpTailSegmentedReduceKernel<WARPS, LOGICAL_WARP_THREADS><<<1, BLOCK_THREADS>>>(
-        d_in,
-        d_flags,
-        d_tail_out,
-        reduction_op,
-        d_elapsed);
-
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Copy out and display results
-    printf("\tTail-based segmented reduction results: ");
-    compare = CompareDeviceResults(h_tail_out, d_tail_out, BLOCK_THREADS, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-    printf("\tElapsed clocks: ");
-    DisplayDeviceResults(d_elapsed, 1);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_flags) delete[] h_flags;
-    if (h_head_out) delete[] h_head_out;
-    if (h_tail_out) delete[] h_tail_out;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
-    if (d_head_out) CubDebugExit(g_allocator.DeviceFree(d_head_out));
-    if (d_tail_out) CubDebugExit(g_allocator.DeviceFree(d_tail_out));
-    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
-}
-
-
-/**
- * Run battery of tests for different full and partial tile sizes
- */
-template <
-    int         WARPS,
-    int         LOGICAL_WARP_THREADS,
-    typename    T,
-    typename    ReductionOp>
-void Test(
-    GenMode     gen_mode,
-    ReductionOp reduction_op)
-{
-    // Partial tiles
-    for (
-        int valid_warp_threads = 1;
-        valid_warp_threads < LOGICAL_WARP_THREADS;
-        valid_warp_threads += CUB_MAX(1, LOGICAL_WARP_THREADS / 5))
-    {
-        // Without wrapper (to test non-excepting PTX POD-op specializations)
-        TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, reduction_op, valid_warp_threads);
-
-        // With wrapper to ensure no ops called on OOB lanes
-        WrapperFunctor<ReductionOp, LOGICAL_WARP_THREADS> wrapped_op(reduction_op, valid_warp_threads);
-        TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, wrapped_op, valid_warp_threads);
-    }
-
-    // Full tile
-    TestReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, reduction_op, LOGICAL_WARP_THREADS);
-
-    // Segmented reduction with different head flags
-    for (int flag_entropy = 0; flag_entropy < 10; ++flag_entropy)
-    {
-        TestSegmentedReduce<WARPS, LOGICAL_WARP_THREADS, T>(gen_mode, flag_entropy, reduction_op);
-    }
-}
-
-
-/**
- * Run battery of tests for different data types and reduce ops
- */
-template <
-    int WARPS,
-    int LOGICAL_WARP_THREADS>
-void Test(GenMode gen_mode)
-{
-    // primitive
-    Test<WARPS, LOGICAL_WARP_THREADS, char>(                gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, short>(               gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, int>(                 gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, long long>(           gen_mode, Sum());
-
-    Test<WARPS, LOGICAL_WARP_THREADS, unsigned char>(       gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, unsigned short>(      gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, unsigned int>(        gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, unsigned long long>(  gen_mode, Sum());
-
-    if (gen_mode != RANDOM)
-    {
-        Test<WARPS, LOGICAL_WARP_THREADS, float>(           gen_mode, Sum());
-        Test<WARPS, LOGICAL_WARP_THREADS, double>(          gen_mode, Sum());
-    }
-
-    // primitive (alternative reduce op)
-    Test<WARPS, LOGICAL_WARP_THREADS, unsigned char>(       gen_mode, Max());
-    Test<WARPS, LOGICAL_WARP_THREADS, unsigned short>(      gen_mode, Max());
-    Test<WARPS, LOGICAL_WARP_THREADS, unsigned int>(        gen_mode, Max());
-    Test<WARPS, LOGICAL_WARP_THREADS, unsigned long long>(  gen_mode, Max());
-
-    // vec-1
-    Test<WARPS, LOGICAL_WARP_THREADS, uchar1>(              gen_mode, Sum());
-
-    // vec-2
-    Test<WARPS, LOGICAL_WARP_THREADS, uchar2>(              gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, ushort2>(             gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, uint2>(               gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, ulonglong2>(          gen_mode, Sum());
-
-    // vec-4
-    Test<WARPS, LOGICAL_WARP_THREADS, uchar4>(              gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, ushort4>(             gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, uint4>(               gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, ulonglong4>(          gen_mode, Sum());
-
-    // complex
-    Test<WARPS, LOGICAL_WARP_THREADS, TestFoo>(             gen_mode, Sum());
-    Test<WARPS, LOGICAL_WARP_THREADS, TestBar>(             gen_mode, Sum());
-}
-
-
-/**
- * Run battery of tests for different problem generation options
- */
-template <
-    int WARPS,
-    int LOGICAL_WARP_THREADS>
-void Test()
-{
-    Test<WARPS, LOGICAL_WARP_THREADS>(UNIFORM);
-    Test<WARPS, LOGICAL_WARP_THREADS>(INTEGER_SEED);
-    Test<WARPS, LOGICAL_WARP_THREADS>(RANDOM);
-}
-
-
-/**
- * Run battery of tests for different number of active warps
- */
-template <int LOGICAL_WARP_THREADS>
-void Test()
-{
-    Test<1, LOGICAL_WARP_THREADS>();
-
-    // Only power-of-two subwarps can be tiled
-    if ((LOGICAL_WARP_THREADS == 32) || PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE)
-        Test<2, LOGICAL_WARP_THREADS>();
-}
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("repeat", g_repeat);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-#ifdef QUICK_TEST
-
-    // Compile/run quick tests
-    TestReduce<1, 32, int>(UNIFORM, Sum());
-
-    TestReduce<1, 32, double>(UNIFORM, Sum());
-    TestReduce<2, 16, TestBar>(UNIFORM, Sum());
-    TestSegmentedReduce<1, 32, int>(UNIFORM, 1, Sum());
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        // Test logical warp sizes
-        Test<32>();
-        Test<16>();
-        Test<9>();
-        Test<7>();
-    }
-
-#endif
-
-    return 0;
-}
-
-
-
-
diff --git a/external/cub/test/test_warp_scan.cu b/external/cub/test/test_warp_scan.cu
deleted file mode 100644
index 69a60113495..00000000000
--- a/external/cub/test/test_warp_scan.cu
+++ /dev/null
@@ -1,630 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Test of WarpScan utilities
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <stdio.h>
-#include <typeinfo>
-
-#include <cub/warp/warp_scan.cuh>
-#include <cub/util_allocator.cuh>
-
-#include "test_util.h"
-
-using namespace cub;
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-bool                    g_verbose       = false;
-int                     g_repeat        = 0;
-CachingDeviceAllocator  g_allocator(true);
-
-
-/**
- * Primitive variant to test
- */
-enum TestMode
-{
-    BASIC,
-    AGGREGATE,
-};
-
-
-
-/**
- * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants)
- */
-template<typename OpT>
-struct WrapperFunctor
-{
-    OpT op;
-
-    WrapperFunctor(OpT op) : op(op) {}
-
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return op(a, b);
-    }
-};
-
-//---------------------------------------------------------------------
-// Test kernels
-//---------------------------------------------------------------------
-
-/// Exclusive scan basic
-template <typename WarpScanT, typename T, typename ScanOpT, typename IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    WarpScanT                       &warp_scan,
-    T                               &data,
-    T                               &initial_value,
-    ScanOpT                         &scan_op,
-    T                               &aggregate,
-    Int2Type<BASIC>                 test_mode,
-    IsPrimitiveT                    is_primitive)
-{
-    // Test basic warp scan
-    warp_scan.ExclusiveScan(data, data, initial_value, scan_op);
-}
-
-/// Exclusive scan aggregate
-template <
-    typename    WarpScanT,
-    typename    T,
-    typename    ScanOpT,
-    typename    IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    WarpScanT                       &warp_scan,
-    T                               &data,
-    T                               &initial_value,
-    ScanOpT                         &scan_op,
-    T                               &aggregate,
-    Int2Type<AGGREGATE>             test_mode,
-    IsPrimitiveT                    is_primitive)
-{
-    // Test with cumulative aggregate
-    warp_scan.ExclusiveScan(data, data, initial_value, scan_op, aggregate);
-}
-
-
-/// Exclusive sum basic
-template <
-    typename    WarpScanT,
-    typename    T>
-__device__ __forceinline__ void DeviceTest(
-    WarpScanT                       &warp_scan,
-    T                               &data,
-    T                               &initial_value,
-    Sum                             &scan_op,
-    T                               &aggregate,
-    Int2Type<BASIC>                 test_mode,
-    Int2Type<true>                  is_primitive)
-{
-    // Test basic warp scan
-    warp_scan.ExclusiveSum(data, data);
-}
-
-
-/// Exclusive sum aggregate
-template <
-    typename    WarpScanT,
-    typename    T>
-__device__ __forceinline__ void DeviceTest(
-    WarpScanT                       &warp_scan,
-    T                               &data,
-    T                               &initial_value,
-    Sum                             &scan_op,
-    T                               &aggregate,
-    Int2Type<AGGREGATE>             test_mode,
-    Int2Type<true>                  is_primitive)
-{
-    // Test with cumulative aggregate
-    warp_scan.ExclusiveSum(data, data, aggregate);
-}
-
-
-/// Inclusive scan basic
-template <
-    typename    WarpScanT,
-    typename    T,
-    typename    ScanOpT,
-    typename    IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    WarpScanT                       &warp_scan,
-    T                               &data,
-    NullType                        &initial_value,
-    ScanOpT                         &scan_op,
-    T                               &aggregate,
-    Int2Type<BASIC>                 test_mode,
-    IsPrimitiveT                    is_primitive)
-{
-    // Test basic warp scan
-    warp_scan.InclusiveScan(data, data, scan_op);
-}
-
-/// Inclusive scan aggregate
-template <
-    typename    WarpScanT,
-    typename    T,
-    typename    ScanOpT,
-    typename    IsPrimitiveT>
-__device__ __forceinline__ void DeviceTest(
-    WarpScanT                       &warp_scan,
-    T                               &data,
-    NullType                        &initial_value,
-    ScanOpT                         &scan_op,
-    T                               &aggregate,
-    Int2Type<AGGREGATE>             test_mode,
-    IsPrimitiveT                    is_primitive)
-{
-    // Test with cumulative aggregate
-    warp_scan.InclusiveScan(data, data, scan_op, aggregate);
-}
-
-/// Inclusive sum basic
-template <
-    typename    WarpScanT,
-    typename    T,
-    typename    InitialValueT>
-__device__ __forceinline__ void DeviceTest(
-    WarpScanT                       &warp_scan,
-    T                               &data,
-    NullType                        &initial_value,
-    Sum                             &scan_op,
-    T                               &aggregate,
-    Int2Type<BASIC>                 test_mode,
-    Int2Type<true>                  is_primitive)
-{
-    // Test basic warp scan
-    warp_scan.InclusiveSum(data, data);
-}
-
-/// Inclusive sum aggregate
-template <
-    typename    WarpScanT,
-    typename    T,
-    typename    InitialValueT>
-__device__ __forceinline__ void DeviceTest(
-    WarpScanT                       &warp_scan,
-    T                               &data,
-    NullType                        &initial_value,
-    Sum                             &scan_op,
-    T                               &aggregate,
-    Int2Type<AGGREGATE>             test_mode,
-    Int2Type<true>                  is_primitive)
-{
-    // Test with cumulative aggregate
-    warp_scan.InclusiveSum(data, data, aggregate);
-}
-
-
-/**
- * WarpScan test kernel
- */
-template <
-    int         LOGICAL_WARP_THREADS,
-    TestMode    TEST_MODE,
-    typename    T,
-    typename    ScanOpT,
-    typename    InitialValueT>
-__global__ void WarpScanKernel(
-    T               *d_in,
-    T               *d_out,
-    T               *d_aggregate,
-    ScanOpT         scan_op,
-    InitialValueT   initial_value,
-    clock_t         *d_elapsed)
-{
-    // Cooperative warp-scan utility type (1 warp)
-    typedef WarpScan<T, LOGICAL_WARP_THREADS> WarpScanT;
-
-    // Allocate temp storage in shared memory
-    __shared__ typename WarpScanT::TempStorage temp_storage;
-
-    // Per-thread tile data
-    T data = d_in[threadIdx.x];
-
-    // Start cycle timer
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t start = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    T aggregate;
-
-    // Test scan
-    WarpScanT warp_scan(temp_storage);
-    DeviceTest(
-        warp_scan,
-        data,
-        initial_value,
-        scan_op,
-        aggregate,
-        Int2Type<TEST_MODE>(),
-        Int2Type<Traits<T>::PRIMITIVE>());
-
-    // Stop cycle timer
-    __threadfence_block();      // workaround to prevent clock hoisting
-    clock_t stop = clock();
-    __threadfence_block();      // workaround to prevent clock hoisting
-
-    // Store data
-    d_out[threadIdx.x] = data;
-
-    if (TEST_MODE != BASIC)
-    {
-        // Store aggregate
-        d_aggregate[threadIdx.x] = aggregate;
-    }
-
-    // Store time
-    if (threadIdx.x == 0)
-    {
-        *d_elapsed = (start > stop) ? start - stop : stop - start;
-    }
-}
-
-
-//---------------------------------------------------------------------
-// Host utility subroutines
-//---------------------------------------------------------------------
-
-/**
- * Initialize exclusive-scan problem (and solution)
- */
-template <
-    typename        T,
-    typename        ScanOpT>
-T Initialize(
-    GenMode         gen_mode,
-    T               *h_in,
-    T               *h_reference,
-    int             num_items,
-    ScanOpT         scan_op,
-    T               initial_value)
-{
-    InitValue(gen_mode, h_in[0], 0);
-
-    T block_aggregate   = h_in[0];
-    h_reference[0]      = initial_value;
-    T inclusive         = scan_op(initial_value, h_in[0]);
-
-    for (int i = 1; i < num_items; ++i)
-    {
-        InitValue(gen_mode, h_in[i], i);
-        h_reference[i] = inclusive;
-        inclusive = scan_op(inclusive, h_in[i]);
-        block_aggregate = scan_op(block_aggregate, h_in[i]);
-    }
-
-    return block_aggregate;
-}
-
-
-/**
- * Initialize inclusive-scan problem (and solution)
- */
-template <
-    typename    T,
-    typename    ScanOpT>
-T Initialize(
-    GenMode     gen_mode,
-    T           *h_in,
-    T           *h_reference,
-    int         num_items,
-    ScanOpT     scan_op,
-    NullType)
-{
-    InitValue(gen_mode, h_in[0], 0);
-
-    T block_aggregate   = h_in[0];
-    T inclusive         = h_in[0];
-    h_reference[0]      = inclusive;
-
-    for (int i = 1; i < num_items; ++i)
-    {
-        InitValue(gen_mode, h_in[i], i);
-        inclusive = scan_op(inclusive, h_in[i]);
-        block_aggregate = scan_op(block_aggregate, h_in[i]);
-        h_reference[i] = inclusive;
-    }
-
-    return block_aggregate;
-}
-
-
-/**
- * Test warp scan
- */
-template <
-    int             LOGICAL_WARP_THREADS,
-    TestMode        TEST_MODE,
-    typename        T,
-    typename        ScanOpT,
-    typename        InitialValueT>        // NullType implies inclusive-scan, otherwise inclusive scan
-void Test(
-    GenMode         gen_mode,
-    ScanOpT         scan_op,
-    InitialValueT   initial_value)
-{
-    // Allocate host arrays
-    T *h_in = new T[LOGICAL_WARP_THREADS];
-    T *h_reference = new T[LOGICAL_WARP_THREADS];
-    T *h_aggregate = new T[LOGICAL_WARP_THREADS];
-
-    // Initialize problem
-    T aggregate = Initialize(
-        gen_mode,
-        h_in,
-        h_reference,
-        LOGICAL_WARP_THREADS,
-        scan_op,
-        initial_value);
-
-    if (g_verbose)
-    {
-        printf("Input: \n");
-        DisplayResults(h_in, LOGICAL_WARP_THREADS);
-        printf("\n");
-    }
-
-    for (int i = 0; i < LOGICAL_WARP_THREADS; ++i)
-    {
-        h_aggregate[i] = aggregate;
-    }
-
-    // Initialize/clear device arrays
-    T *d_in = NULL;
-    T *d_out = NULL;
-    T *d_aggregate = NULL;
-    clock_t *d_elapsed = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * LOGICAL_WARP_THREADS));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * (LOGICAL_WARP_THREADS + 1)));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_aggregate, sizeof(T) * LOGICAL_WARP_THREADS));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(clock_t)));
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * LOGICAL_WARP_THREADS, cudaMemcpyHostToDevice));
-    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * (LOGICAL_WARP_THREADS + 1)));
-    CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * LOGICAL_WARP_THREADS));
-
-    // Run kernel
-    printf("Test-mode %d (%s), gen-mode %d (%s), %s warpscan, %d warp threads, %s (%d bytes) elements:\n",
-        TEST_MODE, typeid(TEST_MODE).name(),
-        gen_mode, typeid(gen_mode).name(),
-        (Equals<InitialValueT, NullType>::VALUE) ? "Inclusive" : "Exclusive",
-        LOGICAL_WARP_THREADS,
-        typeid(T).name(),
-        (int) sizeof(T));
-    fflush(stdout);
-
-    // Run aggregate/prefix kernel
-    WarpScanKernel<LOGICAL_WARP_THREADS, TEST_MODE><<<1, LOGICAL_WARP_THREADS>>>(
-        d_in,
-        d_out,
-        d_aggregate,
-        scan_op,
-        initial_value,
-        d_elapsed);
-
-    printf("\tElapsed clocks: ");
-    DisplayDeviceResults(d_elapsed, 1);
-
-    CubDebugExit(cudaPeekAtLastError());
-    CubDebugExit(cudaDeviceSynchronize());
-
-    // Copy out and display results
-    printf("\tScan results: ");
-    int compare = CompareDeviceResults(h_reference, d_out, LOGICAL_WARP_THREADS, g_verbose, g_verbose);
-    printf("%s\n", compare ? "FAIL" : "PASS");
-    AssertEquals(0, compare);
-
-    // Copy out and display aggregate
-    if (TEST_MODE == AGGREGATE)
-    {
-        printf("\tScan aggregate: ");
-        compare = CompareDeviceResults(h_aggregate, d_aggregate, LOGICAL_WARP_THREADS, g_verbose, g_verbose);
-        printf("%s\n", compare ? "FAIL" : "PASS");
-        AssertEquals(0, compare);
-    }
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (h_reference) delete[] h_reference;
-    if (h_aggregate) delete[] h_aggregate;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-    if (d_aggregate) CubDebugExit(g_allocator.DeviceFree(d_aggregate));
-    if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed));
-}
-
-
-/**
- * Run battery of tests for different primitive variants
- */
-template <
-    int         LOGICAL_WARP_THREADS,
-    typename    ScanOpT,
-    typename    T>
-void Test(
-    GenMode     gen_mode,
-    ScanOpT     scan_op,
-    T           initial_value)
-{
-    // Exclusive
-    Test<LOGICAL_WARP_THREADS, BASIC, T>(gen_mode, scan_op, T());
-    Test<LOGICAL_WARP_THREADS, AGGREGATE, T>(gen_mode, scan_op, T());
-
-    // Exclusive (non-specialized, so we can use initial-value)
-    Test<LOGICAL_WARP_THREADS, BASIC, T>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
-    Test<LOGICAL_WARP_THREADS, AGGREGATE, T>(gen_mode, WrapperFunctor<ScanOpT>(scan_op), initial_value);
-
-    // Inclusive
-    Test<LOGICAL_WARP_THREADS, BASIC, T>(gen_mode, scan_op, NullType());
-    Test<LOGICAL_WARP_THREADS, AGGREGATE, T>(gen_mode, scan_op, NullType());
-}
-
-
-/**
- * Run battery of tests for different data types and scan ops
- */
-template <int LOGICAL_WARP_THREADS>
-void Test(GenMode gen_mode)
-{
-    // Get device ordinal
-    int device_ordinal;
-    CubDebugExit(cudaGetDevice(&device_ordinal));
-
-    // Get ptx version
-    int ptx_version;
-    CubDebugExit(PtxVersion(ptx_version));
-
-    // primitive
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (char) 99);
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (short) 99);
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (int) 99);
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (long) 99);
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (long long) 99);
-    if (gen_mode != RANDOM) {
-        // Only test numerically stable inputs
-        Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (float) 99);
-        if (ptx_version > 100)
-            Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), (double) 99);
-    }
-
-    // primitive (alternative scan op)
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned char) 99);
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned short) 99);
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned int) 99);
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Max(), (unsigned long long) 99);
-
-    // vec-2
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_uchar2(17, 21));
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_ushort2(17, 21));
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_uint2(17, 21));
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_ulong2(17, 21));
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_ulonglong2(17, 21));
-    if (gen_mode != RANDOM) {
-        // Only test numerically stable inputs
-        Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_float2(17, 21));
-        if (ptx_version > 100)
-            Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_double2(17, 21));
-    }
-
-    // vec-4
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_char4(17, 21, 32, 85));
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_short4(17, 21, 32, 85));
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_int4(17, 21, 32, 85));
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_long4(17, 21, 32, 85));
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_longlong4(17, 21, 32, 85));
-    if (gen_mode != RANDOM) {
-        // Only test numerically stable inputs
-        Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_float4(17, 21, 32, 85));
-        if (ptx_version > 100)
-            Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), make_double4(17, 21, 32, 85));
-    }
-
-    // complex
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), TestFoo::MakeTestFoo(17, 21, 32, 85));
-    Test<LOGICAL_WARP_THREADS>(gen_mode, Sum(), TestBar(17, 21));
-
-}
-
-
-/**
- * Run battery of tests for different problem generation options
- */
-template <int LOGICAL_WARP_THREADS>
-void Test()
-{
-    Test<LOGICAL_WARP_THREADS>(UNIFORM);
-    Test<LOGICAL_WARP_THREADS>(INTEGER_SEED);
-    Test<LOGICAL_WARP_THREADS>(RANDOM);
-}
-
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    g_verbose = args.CheckCmdLineFlag("v");
-    args.GetCmdLineArgument("repeat", g_repeat);
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--repeat=<repetitions of entire test suite>]"
-            "[--v] "
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-#ifdef QUICK_TEST
-
-    // Compile/run quick tests
-    Test<32, AGGREGATE, int>(UNIFORM, Sum(), (int) 0);
-    Test<32, AGGREGATE, float>(UNIFORM, Sum(), (float) 0);
-    Test<32, AGGREGATE, long long>(UNIFORM, Sum(), (long long) 0);
-    Test<32, AGGREGATE, double>(UNIFORM, Sum(), (double) 0);
-
-    typedef KeyValuePair<int, float> T;
-    cub::Sum sum_op;
-    Test<32, AGGREGATE, T>(UNIFORM, ReduceBySegmentOp<cub::Sum>(sum_op), T());
-
-#else
-
-    // Compile/run thorough tests
-    for (int i = 0; i <= g_repeat; ++i)
-    {
-        // Test logical warp sizes
-        Test<32>();
-        Test<16>();
-        Test<9>();
-        Test<7>();
-    }
-
-#endif
-
-    return 0;
-}
-
-
-
-
diff --git a/external/cub/tune/Makefile b/external/cub/tune/Makefile
deleted file mode 100644
index cf55efa3001..00000000000
--- a/external/cub/tune/Makefile
+++ /dev/null
@@ -1,192 +0,0 @@
-#/******************************************************************************
-# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
-# * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
-# * 
-# * Redistribution and use in source and binary forms, with or without
-# * modification, are permitted provided that the following conditions are met:
-# *	 * Redistributions of source code must retain the above copyright
-# *	   notice, this list of conditions and the following disclaimer.
-# *	 * Redistributions in binary form must reproduce the above copyright
-# *	   notice, this list of conditions and the following disclaimer in the
-# *	   documentation and/or other materials provided with the distribution.
-# *	 * Neither the name of the NVIDIA CORPORATION nor the
-# *	   names of its contributors may be used to endorse or promote products
-# *	   derived from this software without specific prior written permission.
-# * 
-# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
-# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-# *
-#******************************************************************************/
- 
-#-------------------------------------------------------------------------------
-# Build script for project
-#-------------------------------------------------------------------------------
-
-NVCC = "$(shell which nvcc)"
-NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' |  sed 's/,.*//'))
-
-# detect OS
-OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
-
-#-------------------------------------------------------------------------------
-# Libs
-#-------------------------------------------------------------------------------
-
-
-#-------------------------------------------------------------------------------
-# Includes
-#-------------------------------------------------------------------------------
-
-INC = -I. -I.. -I../test
-
-#-------------------------------------------------------------------------------
-# Libs
-#-------------------------------------------------------------------------------
-
-LIBS += -lcudart 
-
-#-------------------------------------------------------------------------------
-# Defines
-#-------------------------------------------------------------------------------
-
-DEFINES = 
-
-#-------------------------------------------------------------------------------
-# SM Arch
-#-------------------------------------------------------------------------------
-
-ifdef sm
-	SM_ARCH = $(sm)
-else 
-    SM_ARCH = 200
-endif
-
-# Only one arch per tuning binary
-ifeq (350, $(findstring 350, $(SM_ARCH)))
-    SM_TARGETS = -arch=sm_35
-    SM_ARCH = 350
-endif
-ifeq (300, $(findstring 300, $(SM_ARCH)))
-    SM_TARGETS = -arch=sm_30
-    SM_ARCH = 300
-endif
-ifeq (200, $(findstring 200, $(SM_ARCH)))
-    SM_TARGETS = -arch=sm_20
-    SM_ARCH = 200
-endif
-ifeq (130, $(findstring 130, $(SM_ARCH)))
-    SM_TARGETS = -arch=sm_13
-    SM_ARCH = 130
-endif
-ifeq (110, $(findstring 110, $(SM_ARCH)))
-    SM_TARGETS = -arch=sm_11 
-    SM_ARCH = 110
-endif
-ifeq (100, $(findstring 100, $(SM_ARCH)))
-    SM_TARGETS = -arch=sm_10 
-    SM_ARCH = 100
-endif
-
-
-#-------------------------------------------------------------------------------
-# Compiler Flags
-#-------------------------------------------------------------------------------
-
-NVCCFLAGS = -Xptxas -v -Xcudafe -\#
-
-# Help the compiler/linker work with huge numbers of kernels on Windows
-ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
-	NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
-endif
-
-# 32/64-bit (32-bit device pointers by default) 
-ifeq ($(force32), 1)
-	CPU_ARCH = -m32
-	CPU_ARCH_SUFFIX = i386
-else
-	CPU_ARCH = -m64
-	CPU_ARCH_SUFFIX = x86_64
-endif
-
-# CUDA ABI enable/disable (enabled by default) 
-ifneq ($(abi), 0)
-	ABI_SUFFIX = abi
-else 
-	NVCCFLAGS += -Xptxas -abi=no
-	ABI_SUFFIX = noabi
-endif
-
-# NVVM/Open64 middle-end compiler (nvvm by default)
-ifeq ($(open64), 1)
-	NVCCFLAGS += -open64
-	PTX_SUFFIX = open64
-else 
-	PTX_SUFFIX = nvvm
-endif
-
-# Verbose toolchain output from nvcc
-ifeq ($(verbose), 1)
-	NVCCFLAGS += -v
-endif
-
-# Keep intermediate compilation artifacts
-ifeq ($(keep), 1)
-	NVCCFLAGS += -keep
-endif
-
-# Data type size to compile a schmoo binary for
-ifdef tunesize
-    TUNE_SIZE = $(tunesize)
-else 
-	TUNE_SIZE = 4
-endif
-
-
-SUFFIX = $(TUNE_SIZE)B_sm$(SM_ARCH)_$(PTX_SUFFIX)_$(NVCC_VERSION)_$(ABI_SUFFIX)_$(CPU_ARCH_SUFFIX)
-
-#-------------------------------------------------------------------------------
-# Dependency Lists
-#-------------------------------------------------------------------------------
-
-rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
-
-DEPS =	 ./Makefile \
-		../test/test_util.h \
-		$(call rwildcard,../cub/,*.cuh)
-
-
-#-------------------------------------------------------------------------------
-# make default
-#-------------------------------------------------------------------------------
-
-default:
-
-
-#-------------------------------------------------------------------------------
-# make clean
-#-------------------------------------------------------------------------------
-
-clean :
-	rm -f bin/*$(CPU_ARCH_SUFFIX)* 
-	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o
-
-
-
-#-------------------------------------------------------------------------------
-# make tune_device_reduce
-#-------------------------------------------------------------------------------
-
-tune_device_reduce: bin/tune_device_reduce_$(SUFFIX)
-
-bin/tune_device_reduce_$(SUFFIX) : tune_device_reduce.cu $(DEPS)
-	mkdir -p bin
-	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/tune_device_reduce_$(SUFFIX) tune_device_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 -DTUNE_ARCH=$(SM_ARCH) -DTUNE_SIZE=$(TUNE_SIZE)
-
diff --git a/external/cub/tune/tune_device_reduce.cu b/external/cub/tune/tune_device_reduce.cu
deleted file mode 100644
index 090e763ce29..00000000000
--- a/external/cub/tune/tune_device_reduce.cu
+++ /dev/null
@@ -1,763 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2017, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Evaluates different tuning configurations of DeviceReduce.
- *
- * The best way to use this program:
- * (1) Find the best all-around single-block tune for a given arch.
- *     For example, 100 samples [1 ..512], 100 timing iterations per config per sample:
- *         ./bin/tune_device_reduce_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --n=512 --single --device=0
- * (2) Update the single tune in device_reduce.cuh
- * (3) Find the best all-around multi-block tune for a given arch.
- *     For example, 100 samples [single-block tile-size ..  50,331,648], 100 timing iterations per config per sample:
- *         ./bin/tune_device_reduce_sm200_nvvm_5.0_abi_i386 --i=100 --s=100 --device=0
- * (4) Update the multi-block tune in device_reduce.cuh
- *
- ******************************************************************************/
-
-// Ensure printing of CUDA runtime errors to console
-#define CUB_STDERR
-
-#include <vector>
-#include <algorithm>
-#include <stdio.h>
-#include <cub/cub.cuh>
-#include "../test/test_util.h"
-
-using namespace cub;
-using namespace std;
-
-
-//---------------------------------------------------------------------
-// Globals, constants and typedefs
-//---------------------------------------------------------------------
-
-#ifndef TUNE_ARCH
-#define TUNE_ARCH 100
-#endif
-
-int     g_max_items         = 48 * 1024 * 1024;
-int     g_samples           = 100;
-int     g_timing_iterations        = 2;
-bool    g_verbose           = false;
-bool    g_single            = false;
-bool    g_verify            = true;
-CachingDeviceAllocator  g_allocator;
-
-
-//---------------------------------------------------------------------
-// Host utility subroutines
-//---------------------------------------------------------------------
-
-/**
- * Initialize problem
- */
-template <typename T>
-void Initialize(
-    GenMode         gen_mode,
-    T               *h_in,
-    int             num_items)
-{
-    for (int i = 0; i < num_items; ++i)
-    {
-        InitValue(gen_mode, h_in[i], i);
-    }
-}
-
-/**
- * Sequential reduction
- */
-template <typename T, typename ReductionOp>
-T Reduce(
-    T               *h_in,
-    ReductionOp     reduction_op,
-    int             num_items)
-{
-    T retval = h_in[0];
-    for (int i = 1; i < num_items; ++i)
-        retval = reduction_op(retval, h_in[i]);
-
-    return retval;
-}
-
-
-
-//---------------------------------------------------------------------
-// Full tile test generation
-//---------------------------------------------------------------------
-
-
-
-/**
- * Wrapper structure for generating and running different tuning configurations
- */
-template <
-    typename T,
-    typename OffsetT,
-    typename ReductionOp>
-struct Schmoo
-{
-    //---------------------------------------------------------------------
-    // Types
-    //---------------------------------------------------------------------
-
-    /// Pairing of kernel function pointer and corresponding dispatch params
-    template <typename KernelPtr>
-    struct DispatchTuple
-    {
-        KernelPtr                           kernel_ptr;
-        DeviceReduce::KernelDispachParams   params;
-
-        float                               avg_throughput;
-        float                               best_avg_throughput;
-        OffsetT                              best_size;
-        float                               hmean_speedup;
-
-
-        DispatchTuple() :
-            kernel_ptr(0),
-            params(DeviceReduce::KernelDispachParams()),
-            avg_throughput(0.0),
-            best_avg_throughput(0.0),
-            hmean_speedup(0.0),
-            best_size(0)
-        {}
-    };
-
-    /**
-     * Comparison operator for DispatchTuple.avg_throughput
-     */
-    template <typename Tuple>
-    static bool MinSpeedup(const Tuple &a, const Tuple &b)
-    {
-        float delta = a.hmean_speedup - b.hmean_speedup;
-
-        return ((delta < 0.02) && (delta > -0.02)) ?
-            (a.best_avg_throughput < b.best_avg_throughput) :       // Negligible average performance differences: defer to best performance
-            (a.hmean_speedup < b.hmean_speedup);
-    }
-
-
-
-    /// Multi-block reduction kernel type and dispatch tuple type
-    typedef void (*MultiBlockDeviceReduceKernelPtr)(T*, T*, OffsetT, GridEvenShare<OffsetT>, GridQueue<OffsetT>, ReductionOp);
-    typedef DispatchTuple<MultiBlockDeviceReduceKernelPtr> MultiDispatchTuple;
-
-    /// Single-block reduction kernel type and dispatch tuple type
-    typedef void (*SingleBlockDeviceReduceKernelPtr)(T*, T*, OffsetT, ReductionOp);
-    typedef DispatchTuple<SingleBlockDeviceReduceKernelPtr> SingleDispatchTuple;
-
-
-    //---------------------------------------------------------------------
-    // Fields
-    //---------------------------------------------------------------------
-
-    vector<MultiDispatchTuple> multi_kernels;       // List of generated multi-block kernels
-    vector<SingleDispatchTuple> single_kernels;     // List of generated single-block kernels
-
-
-    //---------------------------------------------------------------------
-    // Kernel enumeration methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Must have smem that fits in the SM
-     * Must have vector load length that divides items per thread
-     */
-    template <typename TilesReducePolicy, typename ReductionOp>
-    struct SmemSize
-    {
-        enum
-        {
-            BYTES = sizeof(typename BlockReduceTiles<TilesReducePolicy, T*, OffsetT, ReductionOp>::TempStorage),
-            IS_OK = ((BYTES < ArchProps<TUNE_ARCH>::SMEM_BYTES) &&
-                     (TilesReducePolicy::ITEMS_PER_THREAD % TilesReducePolicy::VECTOR_LOAD_LENGTH == 0))
-        };
-    };
-
-
-    /**
-     * Specialization that allows kernel generation with the specified TilesReducePolicy
-     */
-    template <
-        typename    TilesReducePolicy,
-        bool        IsOk = SmemSize<TilesReducePolicy, ReductionOp>::IS_OK>
-    struct Ok
-    {
-        /// Enumerate multi-block kernel and add to the list
-        template <typename KernelsVector>
-        static void GenerateMulti(
-            KernelsVector &multi_kernels,
-            int subscription_factor)
-        {
-            MultiDispatchTuple tuple;
-            tuple.params.template Init<TilesReducePolicy>(subscription_factor);
-            tuple.kernel_ptr = ReducePrivatizedKernel<TilesReducePolicy, T*, T*, OffsetT, ReductionOp>;
-            multi_kernels.push_back(tuple);
-        }
-
-
-        /// Enumerate single-block kernel and add to the list
-        template <typename KernelsVector>
-        static void GenerateSingle(KernelsVector &single_kernels)
-        {
-            SingleDispatchTuple tuple;
-            tuple.params.template Init<TilesReducePolicy>();
-            tuple.kernel_ptr = ReduceSingleKernel<TilesReducePolicy, T*, T*, OffsetT, ReductionOp>;
-            single_kernels.push_back(tuple);
-        }
-    };
-
-    /**
-     * Specialization that rejects kernel generation with the specified TilesReducePolicy
-     */
-    template <typename TilesReducePolicy>
-    struct Ok<TilesReducePolicy, false>
-    {
-        template <typename KernelsVector>
-        static void GenerateMulti(KernelsVector &multi_kernels, int subscription_factor) {}
-
-        template <typename KernelsVector>
-        static void GenerateSingle(KernelsVector &single_kernels) {}
-    };
-
-
-    /// Enumerate block-scheduling variations
-    template <
-        int                     BLOCK_THREADS,
-        int                     ITEMS_PER_THREAD,
-        int                     VECTOR_LOAD_LENGTH,
-        BlockReduceAlgorithm    BLOCK_ALGORITHM,
-        CacheLoadModifier      LOAD_MODIFIER>
-    void Enumerate()
-    {
-        // Multi-block kernels
-        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 1);
-        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 2);
-        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 4);
-        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateMulti(multi_kernels, 8);
-#if TUNE_ARCH >= 200
-        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_DYNAMIC> >::GenerateMulti(multi_kernels, 1);
-#endif
-
-        // Single-block kernels
-        Ok<BlockReduceTilesPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_MODIFIER, GRID_MAPPING_RAKE> >::GenerateSingle(single_kernels);
-    }
-
-
-    /// Enumerate load modifier variations
-    template <
-        int                     BLOCK_THREADS,
-        int                     ITEMS_PER_THREAD,
-        int                     VECTOR_LOAD_LENGTH,
-        BlockReduceAlgorithm    BLOCK_ALGORITHM>
-    void Enumerate()
-    {
-        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_DEFAULT>();
-#if TUNE_ARCH >= 350
-        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_ALGORITHM, LOAD_LDG>();
-#endif
-    }
-
-
-    /// Enumerate block algorithms
-    template <
-        int BLOCK_THREADS,
-        int ITEMS_PER_THREAD,
-        int VECTOR_LOAD_LENGTH>
-    void Enumerate()
-    {
-        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_REDUCE_RAKING>();
-        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, VECTOR_LOAD_LENGTH, BLOCK_REDUCE_WARP_REDUCTIONS>();
-    }
-
-
-    /// Enumerate vectorization variations
-    template <
-        int BLOCK_THREADS,
-        int ITEMS_PER_THREAD>
-    void Enumerate()
-    {
-        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 1>();
-        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 2>();
-        Enumerate<BLOCK_THREADS, ITEMS_PER_THREAD, 4>();
-    }
-
-
-    /// Enumerate thread-granularity variations
-    template <int BLOCK_THREADS>
-    void Enumerate()
-    {
-        Enumerate<BLOCK_THREADS, 7>();
-        Enumerate<BLOCK_THREADS, 8>();
-        Enumerate<BLOCK_THREADS, 9>();
-
-        Enumerate<BLOCK_THREADS, 11>();
-        Enumerate<BLOCK_THREADS, 12>();
-        Enumerate<BLOCK_THREADS, 13>();
-
-        Enumerate<BLOCK_THREADS, 15>();
-        Enumerate<BLOCK_THREADS, 16>();
-        Enumerate<BLOCK_THREADS, 17>();
-
-        Enumerate<BLOCK_THREADS, 19>();
-        Enumerate<BLOCK_THREADS, 20>();
-        Enumerate<BLOCK_THREADS, 21>();
-
-        Enumerate<BLOCK_THREADS, 23>();
-        Enumerate<BLOCK_THREADS, 24>();
-        Enumerate<BLOCK_THREADS, 25>();
-    }
-
-
-    /// Enumerate block size variations
-    void Enumerate()
-    {
-        printf("\nEnumerating kernels\n"); fflush(stdout);
-
-        Enumerate<32>();
-        Enumerate<64>();
-        Enumerate<96>();
-        Enumerate<128>();
-        Enumerate<160>();
-        Enumerate<192>();
-        Enumerate<256>();
-        Enumerate<512>();
-    }
-
-
-    //---------------------------------------------------------------------
-    // Test methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Test a configuration
-     */
-    void TestConfiguration(
-        MultiDispatchTuple      &multi_dispatch,
-        SingleDispatchTuple     &single_dispatch,
-        T*                      d_in,
-        T*                      d_out,
-        T*                      h_reference,
-        OffsetT                  num_items,
-        ReductionOp             reduction_op)
-    {
-        // Clear output
-        if (g_verify) CubDebugExit(cudaMemset(d_out, 0, sizeof(T)));
-
-        // Allocate temporary storage
-        void            *d_temp_storage = NULL;
-        size_t          temp_storage_bytes = 0;
-        CubDebugExit(DeviceReduce::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            multi_dispatch.kernel_ptr,
-            single_dispatch.kernel_ptr,
-            FillAndResetDrainKernel<OffsetT>,
-            multi_dispatch.params,
-            single_dispatch.params,
-            d_in,
-            d_out,
-            num_items,
-            reduction_op));
-        CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
-
-        // Warmup/correctness iteration
-        CubDebugExit(DeviceReduce::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            multi_dispatch.kernel_ptr,
-            single_dispatch.kernel_ptr,
-            FillAndResetDrainKernel<OffsetT>,
-            multi_dispatch.params,
-            single_dispatch.params,
-            d_in,
-            d_out,
-            num_items,
-            reduction_op));
-
-        if (g_verify) CubDebugExit(cudaDeviceSynchronize());
-
-        // Copy out and display results
-        int compare = (g_verify) ?
-            CompareDeviceResults(h_reference, d_out, 1, true, false) :
-            0;
-
-        // Performance
-        GpuTimer gpu_timer;
-        float elapsed_millis = 0.0;
-        for (int i = 0; i < g_timing_iterations; i++)
-        {
-            gpu_timer.Start();
-
-            CubDebugExit(DeviceReduce::Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                multi_dispatch.kernel_ptr,
-                single_dispatch.kernel_ptr,
-                FillAndResetDrainKernel<OffsetT>,
-                multi_dispatch.params,
-                single_dispatch.params,
-                d_in,
-                d_out,
-                num_items,
-                reduction_op));
-
-            gpu_timer.Stop();
-            elapsed_millis += gpu_timer.ElapsedMillis();
-        }
-
-        // Mooch
-        CubDebugExit(cudaDeviceSynchronize());
-
-        float avg_elapsed = elapsed_millis / g_timing_iterations;
-        float avg_throughput = float(num_items) / avg_elapsed / 1000.0 / 1000.0;
-        float avg_bandwidth = avg_throughput * sizeof(T);
-
-        multi_dispatch.avg_throughput = CUB_MAX(avg_throughput, multi_dispatch.avg_throughput);
-        if (avg_throughput > multi_dispatch.best_avg_throughput)
-        {
-            multi_dispatch.best_avg_throughput = avg_throughput;
-            multi_dispatch.best_size = num_items;
-        }
-
-        single_dispatch.avg_throughput = CUB_MAX(avg_throughput, single_dispatch.avg_throughput);
-        if (avg_throughput > single_dispatch.best_avg_throughput)
-        {
-            single_dispatch.best_avg_throughput = avg_throughput;
-            single_dispatch.best_size = num_items;
-        }
-
-        if (g_verbose)
-        {
-            printf("\t%.2f GB/s, multi_dispatch( ", avg_bandwidth);
-            multi_dispatch.params.Print();
-            printf(" ), single_dispatch( ");
-            single_dispatch.params.Print();
-            printf(" )\n");
-            fflush(stdout);
-        }
-
-        AssertEquals(0, compare);
-
-        // Cleanup temporaries
-        if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
-    }
-
-
-    /**
-     * Evaluate multi-block configurations
-     */
-    void TestMulti(
-        T*                      h_in,
-        T*                      d_in,
-        T*                      d_out,
-        ReductionOp             reduction_op)
-    {
-        // Simple single kernel tuple for use with multi kernel sweep
-        typedef typename DeviceReduce::TunedPolicies<T, OffsetT, TUNE_ARCH>::SinglePolicy SimpleSinglePolicy;
-        SingleDispatchTuple simple_single_tuple;
-        simple_single_tuple.params.template Init<SimpleSinglePolicy>();
-        simple_single_tuple.kernel_ptr = ReduceSingleKernel<SimpleSinglePolicy, T*, T*, OffsetT, ReductionOp>;
-
-        double max_exponent      = log2(double(g_max_items));
-        double min_exponent      = log2(double(simple_single_tuple.params.tile_size));
-        unsigned int max_int     = (unsigned int) -1;
-
-        for (int sample = 0; sample < g_samples; ++sample)
-        {
-            printf("\nMulti-block sample %d, ", sample);
-
-            int num_items;
-            if (sample == 0)
-            {
-                // First sample: use max items
-                num_items = g_max_items;
-                printf("num_items: %d", num_items); fflush(stdout);
-            }
-            else
-            {
-                // Sample a problem size from [2^g_min_exponent, g_max_items].  First 2/3 of the samples are log-distributed, the other 1/3 are uniformly-distributed.
-                unsigned int bits;
-                RandomBits(bits);
-                double scale = double(bits) / max_int;
-
-                if (sample < g_samples / 2)
-                {
-                    // log bias
-                    double exponent = ((max_exponent - min_exponent) * scale) + min_exponent;
-                    num_items = pow(2.0, exponent);
-                    num_items = CUB_MIN(num_items, g_max_items);
-                    printf("num_items: %d (2^%.2f)", num_items, exponent); fflush(stdout);
-                }
-                else
-                {
-                    // uniform bias
-                    num_items = CUB_MAX(pow(2.0, min_exponent), scale * g_max_items);
-                    num_items = CUB_MIN(num_items, g_max_items);
-                    printf("num_items: %d (%.2f * %d)", num_items, scale, g_max_items); fflush(stdout);
-                }
-            }
-            if (g_verbose)
-                printf("\n");
-            else
-                printf(", ");
-
-            // Compute reference
-            T h_reference = Reduce(h_in, reduction_op, num_items);
-
-            // Run test on each multi-kernel configuration
-            float best_avg_throughput = 0.0;
-            for (int j = 0; j < multi_kernels.size(); ++j)
-            {
-                multi_kernels[j].avg_throughput = 0.0;
-
-                TestConfiguration(multi_kernels[j], simple_single_tuple, d_in, d_out, &h_reference, num_items, reduction_op);
-
-                best_avg_throughput = CUB_MAX(best_avg_throughput, multi_kernels[j].avg_throughput);
-            }
-
-            // Print best throughput for this problem size
-            printf("Best: %.2fe9 items/s (%.2f GB/s)\n", best_avg_throughput, best_avg_throughput * sizeof(T));
-
-            // Accumulate speedup (inverse for harmonic mean)
-            for (int j = 0; j < multi_kernels.size(); ++j)
-                multi_kernels[j].hmean_speedup += best_avg_throughput / multi_kernels[j].avg_throughput;
-        }
-
-        // Find max overall throughput and compute hmean speedups
-        float overall_max_throughput = 0.0;
-        for (int j = 0; j < multi_kernels.size(); ++j)
-        {
-            overall_max_throughput = CUB_MAX(overall_max_throughput, multi_kernels[j].best_avg_throughput);
-            multi_kernels[j].hmean_speedup = float(g_samples) / multi_kernels[j].hmean_speedup;
-        }
-
-        // Sort by cumulative speedup
-        sort(multi_kernels.begin(), multi_kernels.end(), MinSpeedup<MultiDispatchTuple>);
-
-        // Print ranked multi configurations
-        printf("\nRanked multi_kernels:\n");
-        for (int j = 0; j < multi_kernels.size(); ++j)
-        {
-            printf("\t (%d) params( ", multi_kernels.size() - j);
-            multi_kernels[j].params.Print();
-            printf(" ) hmean speedup: %.3f, best throughput %.2f @ %d elements (%.2f GB/s, %.2f%%)\n",
-                multi_kernels[j].hmean_speedup,
-                multi_kernels[j].best_avg_throughput,
-                (int) multi_kernels[j].best_size,
-                multi_kernels[j].best_avg_throughput * sizeof(T),
-                multi_kernels[j].best_avg_throughput / overall_max_throughput);
-        }
-
-        printf("\nMax multi-block throughput %.2f (%.2f GB/s)\n", overall_max_throughput, overall_max_throughput * sizeof(T));
-    }
-
-
-    /**
-     * Evaluate single-block configurations
-     */
-    void TestSingle(
-        T*                      h_in,
-        T*                      d_in,
-        T*                      d_out,
-        ReductionOp             reduction_op)
-     {
-        // Construct a NULL-ptr multi-kernel tuple that forces a single-kernel pass
-        MultiDispatchTuple multi_tuple;
-
-        double max_exponent     = log2(double(g_max_items));
-        unsigned int max_int    = (unsigned int) -1;
-
-        for (int sample = 0; sample < g_samples; ++sample)
-        {
-            printf("\nSingle-block sample %d, ", sample);
-
-            int num_items;
-            if (sample == 0)
-            {
-                // First sample: use max items
-                num_items = g_max_items;
-                printf("num_items: %d", num_items); fflush(stdout);
-            }
-            else
-            {
-                // Sample a problem size from [2, g_max_items], log-distributed
-                unsigned int bits;
-                RandomBits(bits);
-                double scale = double(bits) / max_int;
-                double exponent = ((max_exponent - 1) * scale) + 1;
-                num_items = pow(2.0, exponent);
-                printf("num_items: %d (2^%.2f)", num_items, exponent); fflush(stdout);
-            }
-
-            if (g_verbose)
-                printf("\n");
-            else
-                printf(", ");
-
-            // Compute reference
-            T h_reference = Reduce(h_in, reduction_op, num_items);
-
-            // Run test on each single-kernel configuration (pick first multi-config to use, which shouldn't be
-            float best_avg_throughput = 0.0;
-            for (int j = 0; j < single_kernels.size(); ++j)
-            {
-                single_kernels[j].avg_throughput = 0.0;
-
-                TestConfiguration(multi_tuple, single_kernels[j], d_in, d_out, &h_reference, num_items, reduction_op);
-
-                best_avg_throughput = CUB_MAX(best_avg_throughput, single_kernels[j].avg_throughput);
-            }
-
-            // Print best throughput for this problem size
-            printf("Best: %.2fe9 items/s (%.2f GB/s)\n", best_avg_throughput, best_avg_throughput * sizeof(T));
-
-            // Accumulate speedup (inverse for harmonic mean)
-            for (int j = 0; j < single_kernels.size(); ++j)
-                single_kernels[j].hmean_speedup += best_avg_throughput / single_kernels[j].avg_throughput;
-        }
-
-        // Find max overall throughput and compute hmean speedups
-        float overall_max_throughput = 0.0;
-        for (int j = 0; j < single_kernels.size(); ++j)
-        {
-            overall_max_throughput = CUB_MAX(overall_max_throughput, single_kernels[j].best_avg_throughput);
-            single_kernels[j].hmean_speedup = float(g_samples) / single_kernels[j].hmean_speedup;
-        }
-
-        // Sort by cumulative speedup
-        sort(single_kernels.begin(), single_kernels.end(), MinSpeedup<SingleDispatchTuple>);
-
-        // Print ranked single configurations
-        printf("\nRanked single_kernels:\n");
-        for (int j = 0; j < single_kernels.size(); ++j)
-        {
-            printf("\t (%d) params( ", single_kernels.size() - j);
-            single_kernels[j].params.Print();
-            printf(" ) hmean speedup: %.3f, best throughput %.2f @ %d elements (%.2f GB/s, %.2f%%)\n",
-                single_kernels[j].hmean_speedup,
-                single_kernels[j].best_avg_throughput,
-                (int) single_kernels[j].best_size,
-                single_kernels[j].best_avg_throughput * sizeof(T),
-                single_kernels[j].best_avg_throughput / overall_max_throughput);
-        }
-
-        printf("\nMax single-block throughput %.2f (%.2f GB/s)\n", overall_max_throughput, overall_max_throughput * sizeof(T));
-    }
-
-};
-
-
-
-//---------------------------------------------------------------------
-// Main
-//---------------------------------------------------------------------
-
-/**
- * Main
- */
-int main(int argc, char** argv)
-{
-    // Initialize command line
-    CommandLineArgs args(argc, argv);
-    args.GetCmdLineArgument("n", g_max_items);
-    args.GetCmdLineArgument("s", g_samples);
-    args.GetCmdLineArgument("i", g_timing_iterations);
-    g_verbose = args.CheckCmdLineFlag("v");
-    g_single = args.CheckCmdLineFlag("single");
-    g_verify = !args.CheckCmdLineFlag("noverify");
-
-    // Print usage
-    if (args.CheckCmdLineFlag("help"))
-    {
-        printf("%s "
-            "[--device=<device-id>] "
-            "[--n=<max items>]"
-            "[--s=<samples>]"
-            "[--i=<timing iterations>]"
-            "[--single]"
-            "[--v]"
-            "[--noverify]"
-            "\n", argv[0]);
-        exit(0);
-    }
-
-    // Initialize device
-    CubDebugExit(args.DeviceInit());
-
-#if (TUNE_SIZE == 1)
-    typedef unsigned char T;
-#elif (TUNE_SIZE == 2)
-    typedef unsigned short T;
-#elif (TUNE_SIZE == 4)
-    typedef unsigned int T;
-#elif (TUNE_SIZE == 8)
-    typedef unsigned long long T;
-#else
-    // Default
-    typedef unsigned int T;
-#endif
-
-    typedef unsigned int OffsetT;
-    Sum reduction_op;
-
-    // Enumerate kernels
-    Schmoo<T, OffsetT, Sum > schmoo;
-    schmoo.Enumerate();
-
-    // Allocate host arrays
-    T *h_in = new T[g_max_items];
-
-    // Initialize problem
-    Initialize(UNIFORM, h_in, g_max_items);
-
-    // Initialize device arrays
-    T *d_in = NULL;
-    T *d_out = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * g_max_items));
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * 1));
-    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * g_max_items, cudaMemcpyHostToDevice));
-
-    // Test kernels
-    if (g_single)
-        schmoo.TestSingle(h_in, d_in, d_out, reduction_op);
-    else
-        schmoo.TestMulti(h_in, d_in, d_out, reduction_op);
-
-    // Cleanup
-    if (h_in) delete[] h_in;
-    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
-    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
-
-    return 0;
-}
-
-
-

From b0e8700c3567611ced9e97e28a1ec0dee47df97a Mon Sep 17 00:00:00 2001
From: James Wyles <jwyles@nvidia.com>
Date: Fri, 1 Feb 2019 15:05:31 -0700
Subject: [PATCH 5/6] Added RMM hooks in for BFS

---
 src/bfs.cu | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/src/bfs.cu b/src/bfs.cu
index aa5490460fb..9f10ca3239b 100644
--- a/src/bfs.cu
+++ b/src/bfs.cu
@@ -13,6 +13,7 @@
 #include <iomanip>
 #include "bfs.cuh"
 #include <limits>
+#include "rmm_utils.h"
 
 #include "graph_utils.cuh"
 #include "bfs_kernels.cuh"
@@ -31,7 +32,7 @@ namespace cugraph {
 		deterministic = false;
 		//Working data
 		//Each vertex can be in the frontier at most once
-		cudaMalloc(&frontier, n * sizeof(IndexType));
+		ALLOC_MANAGED_TRY(&frontier, n * sizeof(IndexType), nullptr)
 
 		//We will update frontier during the execution
 		//We need the orig to reset frontier, or cudaFree
@@ -40,20 +41,20 @@ namespace cugraph {
 		//size of bitmaps for vertices
 		vertices_bmap_size = (n / (8 * sizeof(int)) + 1);
 		//ith bit of visited_bmap is set <=> ith vertex is visited
-		cudaMalloc(&visited_bmap, sizeof(int) * vertices_bmap_size);
+		ALLOC_MANAGED_TRY(&visited_bmap, sizeof(int) * vertices_bmap_size, nullptr);
 
 		//ith bit of isolated_bmap is set <=> degree of ith vertex = 0
-		cudaMalloc(&isolated_bmap, sizeof(int) * vertices_bmap_size);
+		ALLOC_MANAGED_TRY(&isolated_bmap, sizeof(int) * vertices_bmap_size, nullptr);
 
 		//vertices_degree[i] = degree of vertex i
-		cudaMalloc(&vertex_degree, sizeof(IndexType) * n);
+		ALLOC_MANAGED_TRY(&vertex_degree, sizeof(IndexType) * n, nullptr);
 
 		//Cub working data
 		cub_exclusive_sum_alloc(n + 1, d_cub_exclusive_sum_storage, cub_exclusive_sum_storage_bytes);
 
 		//We will need (n+1) ints buffer for two differents things (bottom up or top down) - sharing it since those uses are mutually exclusive
-		cudaMalloc(&buffer_np1_1, (n + 1) * sizeof(IndexType));
-		cudaMalloc(&buffer_np1_2, (n + 1) * sizeof(IndexType));
+		ALLOC_MANAGED_TRY(&buffer_np1_1, (n + 1) * sizeof(IndexType), nullptr);
+		ALLOC_MANAGED_TRY(&buffer_np1_2, (n + 1) * sizeof(IndexType), nullptr);
 
 		//Using buffers : top down
 
@@ -74,13 +75,13 @@ namespace cugraph {
 
 		//We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of the first edge of the bucket
 		//See top down kernels for more details
-		cudaMalloc(&exclusive_sum_frontier_vertex_buckets_offsets,
-						((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType));
+		ALLOC_MANAGED_TRY(&exclusive_sum_frontier_vertex_buckets_offsets,
+						((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType), nullptr);
 
 		//Init device-side counters
 		//Those counters must be/can be reset at each bfs iteration
 		//Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the current bottleneck
-		cudaMalloc(&d_counters_pad, 4 * sizeof(IndexType));
+		ALLOC_MANAGED_TRY(&d_counters_pad, 4 * sizeof(IndexType), nullptr);
 
 		d_new_frontier_cnt = &d_counters_pad[0];
 		d_mu = &d_counters_pad[1];
@@ -116,7 +117,7 @@ namespace cugraph {
 
 		//We need distances to use bottom up
 		if (directed && !computeDistances)
-			cudaMalloc(&distances, n * sizeof(IndexType));
+			ALLOC_MANAGED_TRY(&distances, n * sizeof(IndexType), nullptr);
 	}
 
 	template<typename IndexType>
@@ -451,19 +452,19 @@ namespace cugraph {
 	template<typename IndexType>
 	void Bfs<IndexType>::clean() {
 		//the vectors have a destructor that takes care of cleaning
-		cudaFree(original_frontier);
-		cudaFree(visited_bmap);
-		cudaFree(isolated_bmap);
-		cudaFree(vertex_degree);
-		cudaFree(d_cub_exclusive_sum_storage);
-		cudaFree(buffer_np1_1);
-		cudaFree(buffer_np1_2);
-		cudaFree(exclusive_sum_frontier_vertex_buckets_offsets);
-		cudaFree(d_counters_pad);
+		ALLOC_FREE_TRY(original_frontier, nullptr);
+		ALLOC_FREE_TRY(visited_bmap, nullptr);
+		ALLOC_FREE_TRY(isolated_bmap, nullptr);
+		ALLOC_FREE_TRY(vertex_degree, nullptr);
+		ALLOC_FREE_TRY(d_cub_exclusive_sum_storage, nullptr);
+		ALLOC_FREE_TRY(buffer_np1_1, nullptr);
+		ALLOC_FREE_TRY(buffer_np1_2, nullptr);
+		ALLOC_FREE_TRY(exclusive_sum_frontier_vertex_buckets_offsets, nullptr);
+		ALLOC_FREE_TRY(d_counters_pad, nullptr);
 
 		//In that case, distances is a working data
 		if (directed && !computeDistances)
-			cudaFree(distances);
+			ALLOC_FREE_TRY(distances, nullptr);
 	}
 
 	template class Bfs<int> ;

From 248552ee319245252c523f8a1cc54d57872c13f2 Mon Sep 17 00:00:00 2001
From: James Wyles <jwyles@nvidia.com>
Date: Fri, 1 Feb 2019 15:25:16 -0700
Subject: [PATCH 6/6] Added semicolon for consistency

---
 src/bfs.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/bfs.cu b/src/bfs.cu
index 9f10ca3239b..903a514018d 100644
--- a/src/bfs.cu
+++ b/src/bfs.cu
@@ -32,7 +32,7 @@ namespace cugraph {
 		deterministic = false;
 		//Working data
 		//Each vertex can be in the frontier at most once
-		ALLOC_MANAGED_TRY(&frontier, n * sizeof(IndexType), nullptr)
+		ALLOC_MANAGED_TRY(&frontier, n * sizeof(IndexType), nullptr);
 
 		//We will update frontier during the execution
 		//We need the orig to reset frontier, or cudaFree